Add AudioVisualPerception to MediaPerceptionPrivate API.

Bug: 808144
Change-Id: I04a25f1776b7299dfc9273108e5fc38ed8c34290
Reviewed-on: https://chromium-review.googlesource.com/897895
Commit-Queue: Wei Guan <weigua@chromium.org>
Reviewed-by: Toni Barzic <tbarzic@chromium.org>
Reviewed-by: Steven Bennetts <stevenjb@chromium.org>
Cr-Original-Commit-Position: refs/heads/master@{#536140}
(cherry picked from commit 9932813b326c3d4b70e2b0ca645b12e5890cf74f)
Reviewed-on: https://chromium-review.googlesource.com/927022
Reviewed-by: Simon Que <sque@chromium.org>
Cr-Commit-Position: refs/branch-heads/3325@{#531}
Cr-Branched-From: bc084a8b5afa3744a74927344e304c02ae54189f-refs/heads/master@{#530369}
diff --git a/chromeos/dbus/proto/media_perception.proto b/chromeos/dbus/proto/media_perception.proto
index ec3e88f..b7aaed22 100644
--- a/chromeos/dbus/proto/media_perception.proto
+++ b/chromeos/dbus/proto/media_perception.proto
@@ -24,6 +24,10 @@
   // A single AudioPerception message or array of audio perceptions (if
   // reporting the results from multiple audio frames).
   repeated AudioPerception audio_perception = 3;
+
+  // A single AudioVisualPerception message or array of audio-visual
+  // perceptions.
+  repeated AudioVisualPerception audio_visual_perception = 4;
 }
 
 // Used to transmit a history of image frames and their associated annotations.
@@ -81,6 +85,24 @@
   optional ImageFrame image_frame = 2;
 
   optional AudioPerception audio_perception = 3;
+
+  optional AudioVisualPerception audio_visual_perception = 4;
+}
+
+// Perception results based on both audio and video inputs.
+message AudioVisualPerception {
+  // Timestamp, in microseconds, attached when this message was generated.
+  optional uint64 timestamp_us = 1;
+
+  // Result of human presence detection based on both audio and video inputs.
+  optional AudioVisualHumanPresenceDetection
+      audio_visual_human_presence_detection = 2;
+}
+
+// Detection of human presence based on both audio and video inputs.
+message AudioVisualHumanPresenceDetection {
+  // Probability, in the [0, 1] interval, that a human is present.
+  optional double human_presence_likelihood = 1;
 }
 
 // Audio perception results for an audio frame.
diff --git a/extensions/browser/api/media_perception_private/conversion_utils.cc b/extensions/browser/api/media_perception_private/conversion_utils.cc
index d106312..6d86f29 100644
--- a/extensions/browser/api/media_perception_private/conversion_utils.cc
+++ b/extensions/browser/api/media_perception_private/conversion_utils.cc
@@ -133,6 +133,35 @@
   return detection_result;
 }
 
+// Converts the proto |detection| to its IDL form. The likelihood is copied
+// only when it is set on the proto.
+std::unique_ptr<AudioVisualHumanPresenceDetection>
+AudioVisualHumanPresenceDetectionProtoToIdl(
+    const mri::AudioVisualHumanPresenceDetection& detection) {
+  auto idl_detection = std::make_unique<AudioVisualHumanPresenceDetection>();
+  if (!detection.has_human_presence_likelihood())
+    return idl_detection;
+
+  idl_detection->human_presence_likelihood =
+      std::make_unique<double>(detection.human_presence_likelihood());
+  return idl_detection;
+}
+
+// Converts |perception| to its IDL form; only the fields that are set on the
+// proto are copied into the result.
+AudioVisualPerception AudioVisualPerceptionProtoToIdl(
+    const mri::AudioVisualPerception& perception) {
+  AudioVisualPerception result;
+  if (perception.has_audio_visual_human_presence_detection()) {
+    result.audio_visual_human_presence_detection =
+        AudioVisualHumanPresenceDetectionProtoToIdl(
+            perception.audio_visual_human_presence_detection());
+  }
+  if (perception.has_timestamp_us())
+    result.timestamp_us = std::make_unique<double>(perception.timestamp_us());
+  return result;
+}
+
 std::unique_ptr<Point> PointProtoToIdl(const mri::Point& point) {
   std::unique_ptr<Point> point_result = std::make_unique<Point>();
   if (point.has_x())
@@ -342,6 +371,11 @@
         std::make_unique<AudioPerception>(
             AudioPerceptionProtoToIdl(perception_sample.audio_perception()));
   }
+  if (perception_sample.has_audio_visual_perception()) {
+    perception_sample_result.audio_visual_perception =
+        std::make_unique<AudioVisualPerception>(AudioVisualPerceptionProtoToIdl(
+            perception_sample.audio_visual_perception()));
+  }
   return perception_sample_result;
 }
 
@@ -471,6 +505,15 @@
     }
   }
 
+  if (media_perception.audio_visual_perception_size() > 0) {
+    media_perception_result.audio_visual_perceptions =
+        std::make_unique<std::vector<AudioVisualPerception>>();
+    for (const auto& perception : media_perception.audio_visual_perception()) {
+      media_perception_result.audio_visual_perceptions->emplace_back(
+          AudioVisualPerceptionProtoToIdl(perception));
+    }
+  }
+
   return media_perception_result;
 }
 
diff --git a/extensions/browser/api/media_perception_private/conversion_utils_unittest.cc b/extensions/browser/api/media_perception_private/conversion_utils_unittest.cc
index 483898e0..d6b91dcd 100644
--- a/extensions/browser/api/media_perception_private/conversion_utils_unittest.cc
+++ b/extensions/browser/api/media_perception_private/conversion_utils_unittest.cc
@@ -63,6 +63,15 @@
   frame_spectrogram->add_values(0.3);
 }
 
+// Populates |audio_visual_perception| with fixed fake values for the tests.
+void InitializeFakeAudioVisualPerception(
+    mri::AudioVisualPerception* audio_visual_perception) {
+  audio_visual_perception->set_timestamp_us(91008);
+  auto* presence =
+      audio_visual_perception->mutable_audio_visual_human_presence_detection();
+  presence->set_human_presence_likelihood(0.5);
+}
+
 void InitializeFakeFramePerception(const int index,
                                    mri::FramePerception* frame_perception) {
   frame_perception->set_frame_id(index);
@@ -259,6 +268,20 @@
   EXPECT_EQ(frame_spectrogram->values->at(0), 0.3);
 }
 
+// Verifies the values set by InitializeFakeAudioVisualPerception().
+void ValidateAudioVisualPerceptionResult(
+    const media_perception::AudioVisualPerception& perception_result) {
+  ASSERT_TRUE(perception_result.timestamp_us);
+  EXPECT_EQ(*perception_result.timestamp_us, 91008);
+
+  // The human presence detection and its likelihood must both be present.
+  const auto& detection =
+      perception_result.audio_visual_human_presence_detection;
+  ASSERT_TRUE(detection);
+  ASSERT_TRUE(detection->human_presence_likelihood);
+  EXPECT_EQ(*detection->human_presence_likelihood, 0.5);
+}
+
 void InitializeFakeImageFrameData(mri::ImageFrame* image_frame) {
   image_frame->set_width(1);
   image_frame->set_height(2);
@@ -296,6 +319,9 @@
   mri::AudioPerception* audio_perception =
       media_perception.add_audio_perception();
   InitializeFakeAudioPerception(audio_perception);
+  mri::AudioVisualPerception* audio_visual_perception =
+      media_perception.add_audio_visual_perception();
+  InitializeFakeAudioVisualPerception(audio_visual_perception);
   media_perception::MediaPerception media_perception_result =
       media_perception::MediaPerceptionProtoToIdl(media_perception);
   EXPECT_EQ(*media_perception_result.timestamp, 1);
@@ -307,6 +333,8 @@
       kFrameId, media_perception_result.frame_perceptions->at(0));
   ValidateAudioPerceptionResult(
       media_perception_result.audio_perceptions->at(0));
+  ValidateAudioVisualPerceptionResult(
+      media_perception_result.audio_visual_perceptions->at(0));
 }
 
 TEST(MediaPerceptionConversionUtilsTest, DiagnosticsProtoToIdl) {
diff --git a/extensions/common/api/media_perception_private.idl b/extensions/common/api/media_perception_private.idl
index 0d196d0..ed4e9ce 100644
--- a/extensions/common/api/media_perception_private.idl
+++ b/extensions/common/api/media_perception_private.idl
@@ -298,6 +298,21 @@
     AudioHumanPresenceDetection? audioHumanPresenceDetection;
   };
 
+  // Detection of human presence based on both audio and video inputs.
+  dictionary AudioVisualHumanPresenceDetection {
+    // Probability, in the [0, 1] interval, that a human is present.
+    double? humanPresenceLikelihood;
+  };
+
+  // Perception results based on both audio and video inputs.
+  dictionary AudioVisualPerception {
+    // Timestamp, in microseconds, attached when this message was generated.
+    double? timestampUs;
+
+    // Result of human presence detection based on both audio and video inputs.
+    AudioVisualHumanPresenceDetection? audioVisualHumanPresenceDetection;
+  };
+
   dictionary MediaPerception {
     // The time the media perception data was emitted by the media processing
     // pipeline. This value will be greater than the timestamp stored within
@@ -310,6 +325,9 @@
 
     // An array of audio perceptions.
     AudioPerception[]? audioPerceptions;
+
+    // An array of audio-visual perceptions.
+    AudioVisualPerception[]? audioVisualPerceptions;
   };
 
   enum ImageFormat {
@@ -341,6 +359,9 @@
 
     // The audio perception results for an audio frame.
     AudioPerception? audioPerception;
+
+    // Perception results based on both audio and video inputs.
+    AudioVisualPerception? audioVisualPerception;
   };
 
   dictionary Diagnostics {