Add AudioVisualPerception to MediaPerceptionPrivate API.
Bug: 808144
Change-Id: I04a25f1776b7299dfc9273108e5fc38ed8c34290
Reviewed-on: https://chromium-review.googlesource.com/897895
Commit-Queue: Wei Guan <weigua@chromium.org>
Reviewed-by: Toni Barzic <tbarzic@chromium.org>
Reviewed-by: Steven Bennetts <stevenjb@chromium.org>
Cr-Original-Commit-Position: refs/heads/master@{#536140}
(cherry picked from commit 9932813b326c3d4b70e2b0ca645b12e5890cf74f)
Reviewed-on: https://chromium-review.googlesource.com/927022
Reviewed-by: Simon Que <sque@chromium.org>
Cr-Commit-Position: refs/branch-heads/3325@{#531}
Cr-Branched-From: bc084a8b5afa3744a74927344e304c02ae54189f-refs/heads/master@{#530369}
diff --git a/chromeos/dbus/proto/media_perception.proto b/chromeos/dbus/proto/media_perception.proto
index ec3e88f..b7aaed22 100644
--- a/chromeos/dbus/proto/media_perception.proto
+++ b/chromeos/dbus/proto/media_perception.proto
@@ -24,6 +24,10 @@
// A single AudioPerception message or array of audio perceptions (if
// reporting the results from multiple audio frames).
repeated AudioPerception audio_perception = 3;
+
+ // A single AudioVisualPerception message or array of audio-visual
+ // perceptions.
+ repeated AudioVisualPerception audio_visual_perception = 4;
}
// Used to transmit a history of image frames and their associated annotations.
@@ -81,6 +85,24 @@
optional ImageFrame image_frame = 2;
optional AudioPerception audio_perception = 3;
+
+ optional AudioVisualPerception audio_visual_perception = 4;
+}
+
+// Perception results based on both audio and video inputs.
+message AudioVisualPerception {
+ // A timestamp in microseconds attached when this message was generated.
+ optional uint64 timestamp_us = 1;
+
+ // Human presence detection results.
+ optional AudioVisualHumanPresenceDetection
+ audio_visual_human_presence_detection = 2;
+}
+
+// Detection of human presence based on both audio and video inputs.
+message AudioVisualHumanPresenceDetection {
+ // Indicates a probability in [0, 1] interval that a human is present.
+ optional double human_presence_likelihood = 1;
}
// Audio perception results for an audio frame.
diff --git a/extensions/browser/api/media_perception_private/conversion_utils.cc b/extensions/browser/api/media_perception_private/conversion_utils.cc
index d106312..6d86f29 100644
--- a/extensions/browser/api/media_perception_private/conversion_utils.cc
+++ b/extensions/browser/api/media_perception_private/conversion_utils.cc
@@ -133,6 +133,35 @@
return detection_result;
}
+std::unique_ptr<AudioVisualHumanPresenceDetection>
+AudioVisualHumanPresenceDetectionProtoToIdl(
+ const mri::AudioVisualHumanPresenceDetection& detection) {
+ std::unique_ptr<AudioVisualHumanPresenceDetection> detection_result =
+ std::make_unique<AudioVisualHumanPresenceDetection>();
+
+ if (detection.has_human_presence_likelihood()) {
+ detection_result->human_presence_likelihood =
+ std::make_unique<double>(detection.human_presence_likelihood());
+ }
+
+ return detection_result;
+}
+
+AudioVisualPerception AudioVisualPerceptionProtoToIdl(
+ const mri::AudioVisualPerception& perception) {
+ AudioVisualPerception perception_result;
+ if (perception.has_timestamp_us()) {
+ perception_result.timestamp_us =
+ std::make_unique<double>(perception.timestamp_us());
+ }
+ if (perception.has_audio_visual_human_presence_detection()) {
+ perception_result.audio_visual_human_presence_detection =
+ AudioVisualHumanPresenceDetectionProtoToIdl(
+ perception.audio_visual_human_presence_detection());
+ }
+ return perception_result;
+}
+
std::unique_ptr<Point> PointProtoToIdl(const mri::Point& point) {
std::unique_ptr<Point> point_result = std::make_unique<Point>();
if (point.has_x())
@@ -342,6 +371,11 @@
std::make_unique<AudioPerception>(
AudioPerceptionProtoToIdl(perception_sample.audio_perception()));
}
+ if (perception_sample.has_audio_visual_perception()) {
+ perception_sample_result.audio_visual_perception =
+ std::make_unique<AudioVisualPerception>(AudioVisualPerceptionProtoToIdl(
+ perception_sample.audio_visual_perception()));
+ }
return perception_sample_result;
}
@@ -471,6 +505,15 @@
}
}
+ if (media_perception.audio_visual_perception_size() > 0) {
+ media_perception_result.audio_visual_perceptions =
+ std::make_unique<std::vector<AudioVisualPerception>>();
+ for (const auto& perception : media_perception.audio_visual_perception()) {
+ media_perception_result.audio_visual_perceptions->emplace_back(
+ AudioVisualPerceptionProtoToIdl(perception));
+ }
+ }
+
return media_perception_result;
}
diff --git a/extensions/browser/api/media_perception_private/conversion_utils_unittest.cc b/extensions/browser/api/media_perception_private/conversion_utils_unittest.cc
index 483898e0..d6b91dcd 100644
--- a/extensions/browser/api/media_perception_private/conversion_utils_unittest.cc
+++ b/extensions/browser/api/media_perception_private/conversion_utils_unittest.cc
@@ -63,6 +63,15 @@
frame_spectrogram->add_values(0.3);
}
+void InitializeFakeAudioVisualPerception(
+ mri::AudioVisualPerception* audio_visual_perception) {
+ audio_visual_perception->set_timestamp_us(91008);
+
+ mri::AudioVisualHumanPresenceDetection* detection =
+ audio_visual_perception->mutable_audio_visual_human_presence_detection();
+ detection->set_human_presence_likelihood(0.5);
+}
+
void InitializeFakeFramePerception(const int index,
mri::FramePerception* frame_perception) {
frame_perception->set_frame_id(index);
@@ -259,6 +268,20 @@
EXPECT_EQ(frame_spectrogram->values->at(0), 0.3);
}
+void ValidateAudioVisualPerceptionResult(
+ const media_perception::AudioVisualPerception& perception_result) {
+ ASSERT_TRUE(perception_result.timestamp_us);
+ EXPECT_EQ(*perception_result.timestamp_us, 91008);
+
+ // Validate audio-visual human presence detection.
+ const media_perception::AudioVisualHumanPresenceDetection*
+ presence_detection =
+ perception_result.audio_visual_human_presence_detection.get();
+ ASSERT_TRUE(presence_detection);
+ ASSERT_TRUE(presence_detection->human_presence_likelihood);
+ EXPECT_EQ(*presence_detection->human_presence_likelihood, 0.5);
+}
+
void InitializeFakeImageFrameData(mri::ImageFrame* image_frame) {
image_frame->set_width(1);
image_frame->set_height(2);
@@ -296,6 +319,9 @@
mri::AudioPerception* audio_perception =
media_perception.add_audio_perception();
InitializeFakeAudioPerception(audio_perception);
+ mri::AudioVisualPerception* audio_visual_perception =
+ media_perception.add_audio_visual_perception();
+ InitializeFakeAudioVisualPerception(audio_visual_perception);
media_perception::MediaPerception media_perception_result =
media_perception::MediaPerceptionProtoToIdl(media_perception);
EXPECT_EQ(*media_perception_result.timestamp, 1);
@@ -307,6 +333,8 @@
kFrameId, media_perception_result.frame_perceptions->at(0));
ValidateAudioPerceptionResult(
media_perception_result.audio_perceptions->at(0));
+ ValidateAudioVisualPerceptionResult(
+ media_perception_result.audio_visual_perceptions->at(0));
}
TEST(MediaPerceptionConversionUtilsTest, DiagnosticsProtoToIdl) {
diff --git a/extensions/common/api/media_perception_private.idl b/extensions/common/api/media_perception_private.idl
index 0d196d0..ed4e9ce 100644
--- a/extensions/common/api/media_perception_private.idl
+++ b/extensions/common/api/media_perception_private.idl
@@ -298,6 +298,21 @@
AudioHumanPresenceDetection? audioHumanPresenceDetection;
};
+ // Detection of human presence based on both audio and video inputs.
+ dictionary AudioVisualHumanPresenceDetection {
+ // Indicates a probability in [0, 1] interval that a human is present.
+ double? humanPresenceLikelihood;
+ };
+
+ // Perception results based on both audio and video inputs.
+ dictionary AudioVisualPerception {
+ // A timestamp in microseconds attached when this message was generated.
+ double? timestampUs;
+
+ // Human presence detection results.
+ AudioVisualHumanPresenceDetection? audioVisualHumanPresenceDetection;
+ };
+
dictionary MediaPerception {
// The time the media perception data was emitted by the media processing
// pipeline. This value will be greater than the timestamp stored within
@@ -310,6 +325,9 @@
// An array of audio perceptions.
AudioPerception[]? audioPerceptions;
+
+ // An array of audio-visual perceptions.
+ AudioVisualPerception[]? audioVisualPerceptions;
};
enum ImageFormat {
@@ -341,6 +359,9 @@
// The audio perception results for an audio frame.
AudioPerception? audioPerception;
+
+ // Perception results based on both audio and video inputs.
+ AudioVisualPerception? audioVisualPerception;
};
dictionary Diagnostics {