syntax = "proto2";
option optimize_for = LITE_RUNTIME;
package mri;
// The output of the media analytics process. Implicitly tied to the
// MediaPerception dictionary defined in Chromium source at
// src/extensions/common/api/media_perception_private.idl for the
// Chromium mediaPerceptionPrivate API. This .proto needs to be compatible
// with the version used in the binary checked into the Chromebox For
// Meetings private overlay.
//
// This message is packaged by the graph runner when a PerceptionSample
// or array of PerceptionSamples comes out of the graph.
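//
// For illustration, a sketch of a populated MediaPerception in proto text
// format (all values below are hypothetical):
//
//   timestamp: 1500000000000  # milliseconds since Epoch
//   frame_perception {
//     frame_id: 7
//     entity {
//       type: FACE
//       confidence: 0.87
//     }
//   }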
message MediaPerception {
  // The timestamp attached when this data originated from the analysis
  // process.
  optional uint64 timestamp = 1;  // In milliseconds since Epoch.
  // A single FramePerception message or array of perceptions (if reporting the
  // results from multiple frames).
  repeated FramePerception frame_perception = 2;
  // A single AudioPerception message or array of audio perceptions (if
  // reporting the results from multiple audio frames).
  repeated AudioPerception audio_perception = 3;
  // A single AudioVisualPerception message or array of audio-visual
  // perceptions.
  repeated AudioVisualPerception audio_visual_perception = 4;
  // Stores metadata such as version of media perception features.
  optional Metadata metadata = 5;
}
// Stores metadata such as version of media perception features.
message Metadata {
  optional string visual_experience_controller_version = 1;
}
// Used to transmit a history of image frames and their associated annotations.
// This is accumulated over time by the graph runner.
message Diagnostics {
  repeated PerceptionSample perception_sample = 1;
}
message VideoStreamParam {
  // Identifies the video stream described by these parameters.
  optional string id = 1;
  // Frame width in pixels.
  optional int32 width = 2;
  // Frame height in pixels.
  optional int32 height = 3;
  // The frame rate at which this video stream would be processed.
  optional float frame_rate = 4;
}
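// The desired state of the media analytics process, sent via the
// mediaPerceptionPrivate SetState call. For illustration, a sketch of a State
// that starts processing with autozoom enabled and one template argument
// overridden (values are hypothetical, and "detection_threshold" is an
// invented argument name):
//
//   status: RUNNING
//   configuration: "..."
//   features: FEATURE_AUTOZOOM
//   named_template_arguments {
//     name: "detection_threshold"
//     num: 0.5
//   }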
message State {
  enum Status {
    STATUS_UNSPECIFIED = 0;  // Unused required default value for Proto enums.
    UNINITIALIZED = 1;       // Media analytics working on loading configuration.
    STARTED = 2;             // Analysis process running but not receiving frames.
    RUNNING = 3;             // Analysis process running and ingesting frames.
    SUSPENDED = 4;           // Media analytics process waiting to be started.
    RESTARTING = 5;          // Media analytics process should be restarted.
    STOPPED = 6;             // Media analytics process should be stopped.
  }
  // Note: RUNNING and SUSPENDED are the only two states that should be sent to
  // SetState.
  optional Status status = 1;
  // Device context so that the media analytics process can better select the
  // right video device to open.
  optional string device_context = 2;
  // A list of video streams processed by the analytics process.
  repeated VideoStreamParam video_stream_param = 3;
  // Media analytics configuration. It can only be used when setting the state
  // to RUNNING.
  optional string configuration = 4;
  // Corners and aspect ratio of the whiteboard in the image frame. Should only
  // be set when setting the state to RUNNING with a whiteboard configuration.
  optional Whiteboard whiteboard = 5;
  enum Feature {
    // FEATURE_UNSET is not a real feature value.
    FEATURE_UNSET = 0;
    FEATURE_AUTOZOOM = 1;
    FEATURE_HOTWORD_DETECTION = 2;
    FEATURE_OCCUPANCY_DETECTION = 3;
    FEATURE_EDGE_EMBEDDINGS = 4;
    FEATURE_SOFTWARE_CROPPING = 5;
  }
  // A list of enabled media perception features.
  repeated Feature features = 6;
  message NamedTemplateArgument {
    optional string name = 1;
    oneof value {
      string str = 2;
      double num = 3;
    }
  }
  // An optional list of template arguments to be substituted at run-time. Each
  // argument present in this list will be set to the specified value; others
  // will not be changed. Nested arguments (dictionaries) are not supported.
  // CAUTION: These template args can be used to overwrite the Feature flags
  // defined above, since feature flags are implemented as numeric template
  // args.
  repeated NamedTemplateArgument named_template_arguments = 7;
}
// This is the output of the MediaPerceptionSinkCalculator.
message PerceptionSample {
  optional FramePerception frame_perception = 1;
  // The image frame data associated with the frame perception.
  optional ImageFrame image_frame = 2;
  optional AudioPerception audio_perception = 3;
  optional AudioVisualPerception audio_visual_perception = 4;
  // Stores metadata such as version of media perception features.
  optional Metadata metadata = 5;
}
// Perception results based on both audio and video inputs.
message AudioVisualPerception {
  // A timestamp in microseconds attached when this message was generated.
  optional uint64 timestamp_us = 1;
  // Human presence detection results.
  optional AudioVisualHumanPresenceDetection
      audio_visual_human_presence_detection = 2;
}
// Detection of human presence based on both audio and video inputs.
message AudioVisualHumanPresenceDetection {
  // Indicates a probability in [0, 1] interval that a human is present.
  optional double human_presence_likelihood = 1;
}
// Audio perception results for an audio frame.
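//
// For illustration, a hypothetical AudioPerception carrying a localization
// estimate; the peak of azimuth_scores (0.4, at index 3 of 4) corresponds to
// an azimuth of about 3*pi/2:
//
//   timestamp_us: 1500000000000000
//   audio_localization {
//     azimuth_radians: 4.71
//     azimuth_scores: [0.1, 0.2, 0.3, 0.4]
//   }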
message AudioPerception {
  // A timestamp in microseconds attached when this message was generated.
  optional uint64 timestamp_us = 1;
  // Audio localization results for an audio frame.
  optional AudioLocalization audio_localization = 2;
  // Human presence detection results for an audio frame.
  optional AudioHumanPresenceDetection audio_human_presence_detection = 3;
  // Hotword detection results.
  optional HotwordDetection hotword_detection = 4;
}
// Detection of hotwords in the audio stream.
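//
// For illustration, a hypothetical detection spanning two consecutive audio
// frames; both Hotword entries carry id 11 because they belong to the same
// hotword instance:
//
//   hotwords { id: 11 type: OK_GOOGLE frame_id: 3 confidence: 0.6 }
//
// and, in the HotwordDetection for the next frame:
//
//   hotwords { id: 11 type: OK_GOOGLE frame_id: 4 confidence: 0.9 }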
message HotwordDetection {
  enum Type {
    UNKNOWN_TYPE = 0;
    OK_GOOGLE = 1;
  }
  message Hotword {
    // Unique identifier for the hotword instance. Note that a single hotword
    // instance can span more than one audio frame. In that case a single
    // hotword instance can be reported in multiple Hotword or HotwordDetection
    // messages. Hotword messages associated with the same hotword instance
    // will have the same id.
    optional uint64 id = 1;
    // Indicates the type of this hotword.
    optional Type type = 2;
    // Id of the audio frame in which the hotword was detected.
    optional uint64 frame_id = 3;
    // Indicates the start time of this hotword in the audio frame.
    optional uint64 start_timestamp_ms = 4;
    // Indicates the end time of this hotword in the audio frame.
    optional uint64 end_timestamp_ms = 5;
    // Indicates a probability in [0, 1] interval that this hotword is present
    // in the audio frame.
    optional float confidence = 6;
  }
  repeated Hotword hotwords = 1;
}
// An estimate of the direction that the sound is coming from.
message AudioLocalization {
  // An angle in radians in the horizontal plane. It roughly points to the peak
  // in the probability distribution of azimuth defined below.
  optional double azimuth_radians = 1;
  // A probability distribution for the current snapshot in time that shows the
  // likelihood of a sound source being at a particular azimuth; score i of N
  // corresponds to an azimuth of 2*pi*i/N. For example, azimuth_scores =
  // [0.1, 0.2, 0.3, 0.4] means that the probability that the sound is coming
  // from an azimuth of 0, pi/2, pi, 3*pi/2 is 0.1, 0.2, 0.3 and 0.4,
  // respectively.
  repeated double azimuth_scores = 2;
}
// Detection of human presence close to the microphone.
message AudioHumanPresenceDetection {
  // Indicates a probability in [0, 1] interval that a human has caused a sound
  // close to the microphone.
  optional double human_presence_likelihood = 1;
  // Estimate of the noise spectrogram.
  optional AudioSpectrogram noise_spectrogram = 2;
  // Spectrogram of an audio frame.
  optional AudioSpectrogram frame_spectrogram = 3;
}
// Spectrogram of an audio frame.
message AudioSpectrogram {
  repeated double values = 1;
}
// This message stores the image frame along with its metadata.
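//
// For illustration (hypothetical values; data_length appears to be the size
// of pixel_data in bytes, e.g. width * height * 3 for raw RGB, though the
// original schema does not document this):
//
//   width: 2
//   height: 2
//   format: RGB
//   data_length: 12
//   pixel_data: "..."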
message ImageFrame {
  enum Format {
    FORMAT_UNSPECIFIED = 0;  // Unused required default value for Proto enums.
    RGB = 1;                 // Raw RGB image.
    PNG = 2;                 // PNG image.
    JPEG = 3;                // JPEG image.
  }
  optional int32 width = 1;
  optional int32 height = 2;
  optional Format format = 3;
  optional int32 data_length = 4;
  optional bytes pixel_data = 5;
}
// The set of computer vision metadata for an image frame.
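//
// For illustration, a hypothetical FramePerception for a frame on which face
// detection was run:
//
//   frame_id: 7
//   frame_width_in_px: 1280
//   frame_height_in_px: 720
//   perception_types: FACE_DETECTION
//   entity {
//     type: FACE
//     confidence: 0.87
//     bounding_box {
//       top_left { x: 0.25 y: 0.25 }
//       bottom_right { x: 0.5 y: 0.5 }
//       normalized: true
//     }
//   }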
message FramePerception {
  optional uint64 frame_id = 1;
  optional uint32 frame_width_in_px = 2;
  optional uint32 frame_height_in_px = 3;
  // The timestamp associated with the frame (when it enters the graph).
  optional uint64 timestamp = 4;
  // The list of entities detected for this frame.
  repeated Entity entity = 5;
  // Latency measurements for a list of packet streams in the Drishti graph.
  repeated PacketLatency packet_latency = 6;
  // Human presence detection results for a video frame.
  optional VideoHumanPresenceDetection video_human_presence_detection = 7;
  enum PerceptionType {
    UNKNOWN_TYPE = 0;
    FACE_DETECTION = 1;
    MOTION_DETECTION = 2;
    PERSON_DETECTION = 3;
  }
  // Indicates which types of frame perception were run.
  repeated PerceptionType perception_types = 8;
}
// Detection of human presence close to the camera.
message VideoHumanPresenceDetection {
  // Indicates a probability in [0, 1] interval that a human is present in the
  // video frame.
  optional double human_presence_likelihood = 1;
  // Indicates a probability in [0, 1] interval that motion has been detected
  // in the video frame.
  optional double motion_detected_likelihood = 2;
  // Type of lighting conditions.
  enum LightCondition {
    UNSPECIFIED = 0;
    // No noticeable change occurred.
    NO_CHANGE = 1;
    // Light was switched on in the room.
    TURNED_ON = 2;
    // Light was switched off in the room.
    TURNED_OFF = 3;
    // Light gradually got dimmer (for example, due to a sunset).
    DIMMER = 4;
    // Light gradually got brighter (for example, due to a sunrise).
    BRIGHTER = 5;
    // Black frame detected - the current frame contains only noise.
    BLACK_FRAME = 6;
  }
  // Indicates the lighting condition in the video frame.
  optional LightCondition light_condition = 3;
  // Indicates a probability in [0, 1] interval that the light condition value
  // is correct.
  optional double light_condition_likelihood = 4;
}
message Entity {
  // A unique id associated with the detected entity, which can be used to
  // track the entity over time.
  optional uint32 id = 1;
  enum EntityType {
    UNSPECIFIED = 0;
    FACE = 1;
    PERSON = 2;
    MOTION_REGION = 3;
    LABELED_REGION = 4;
  }
  optional EntityType type = 2;
  // The minimal box that captures the entire detected entity.
  optional BoundingBox bounding_box = 3;
  // A value for the quality of this detection.
  optional float confidence = 4;
  // Perpendicular distance (depth) from the camera plane to the entity.
  optional Distance depth = 5;
  // String label for this entity.
  optional string label = 6;
}
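// A bounding box for a detected entity. For example (hypothetical values),
// with normalized = true a box covering the central region of the frame could
// be expressed as top_left { x: 0.25 y: 0.25 }, bottom_right
// { x: 0.75 y: 0.75 }; with normalized = false the same corners would be
// given in pixel coordinates.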
message BoundingBox {
  // The points that define the corners of a bounding box.
  optional Point top_left = 1;
  optional Point bottom_right = 2;
  // Indicates whether or not these coordinates are normalized to values
  // between 0 and 1.
  optional bool normalized = 3 [default = false];
}
message PacketLatency {
  // An identifier label for the packet.
  optional string label = 1;
  // Delay in microseconds with respect to a reference packet.
  optional uint64 latency_usec = 2;
}
message Point {
  // x represents the horizontal distance from the top left corner of the image
  // to the point.
  optional float x = 1;
  // y represents the vertical distance from the top left corner of the image
  // to the point.
  optional float y = 2;
}
// Generic message object to encapsulate a distance magnitude and units.
message Distance {
  enum DistanceUnits {
    UNITS_UNSPECIFIED = 0;
    METERS = 1;
    PIXELS = 2;
  }
  optional DistanceUnits units = 1;
  optional float magnitude = 2;
}
// The parameters for a whiteboard in the image frame.
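//
// For illustration, a hypothetical whiteboard registration (aspect_ratio is
// presumably the physical width divided by height, e.g. 1.78 for a 16:9
// board; the original schema does not pin this down):
//
//   top_left { x: 0.1 y: 0.2 }
//   top_right { x: 0.9 y: 0.2 }
//   bottom_left { x: 0.1 y: 0.8 }
//   bottom_right { x: 0.9 y: 0.8 }
//   aspect_ratio: 1.78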
message Whiteboard {
  // The top left corner of the whiteboard in the image frame.
  optional Point top_left = 1;
  // The top right corner of the whiteboard in the image frame.
  optional Point top_right = 2;
  // The bottom left corner of the whiteboard in the image frame.
  optional Point bottom_left = 3;
  // The bottom right corner of the whiteboard in the image frame.
  optional Point bottom_right = 4;
  // The physical aspect ratio of the whiteboard.
  optional float aspect_ratio = 5;
}