Add optional time-per-test field to Chromium results (#33387)
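
Record the timestamp of each `test_start` event and, when the matching
`test_end` arrives, store the elapsed wall-clock time (rounded to
millisecond precision) in the per-test results dict under a new "time"
key. Negative durations, which can occur if events arrive out of order
or carry skewed timestamps, are dropped rather than reported.

As a rough sketch (values are illustrative, not taken from a real run),
a leaf entry in the results trie now looks like:

    # Hypothetical per-test leaf produced by _store_test_result for a
    # test that passed after 0.2 seconds of wall-clock time.
    {
        "actual": "PASS",
        "expected": "PASS",
        "time": 0.2,
    }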

diff --git a/tools/wptrunner/wptrunner/formatters/chromium.py b/tools/wptrunner/wptrunner/formatters/chromium.py
index f2f9a63..5e7f4bd 100644
--- a/tools/wptrunner/wptrunner/formatters/chromium.py
+++ b/tools/wptrunner/wptrunner/formatters/chromium.py
@@ -23,9 +23,15 @@
         # A map of test status to the number of tests that had that status.
         self.num_failures_by_status = defaultdict(int)
 
-        # Start time, expressed as offset since UNIX epoch in seconds.
+        # Start time, expressed as offset since UNIX epoch in seconds. Measured
+        # from the first `suite_start` event.
         self.start_timestamp_seconds = None
 
+        # A map of test names to test start timestamps, expressed in seconds
+        # since UNIX epoch. Only contains tests that are currently running
+        # (i.e., have not received the `test_end` event).
+        self.test_starts = {}
+
         # Trie of test results. Each directory in the test name is a node in
         # the trie and the leaf contains the dict of per-test data.
         self.tests = {}
@@ -94,7 +100,7 @@
 
     def _store_test_result(self, name, actual, expected, actual_metadata,
                            messages, wpt_actual, subtest_failure,
-                           reftest_screenshots=None):
+                           duration=None, reftest_screenshots=None):
         """
         Stores the result of a single test in |self.tests|
 
@@ -105,6 +111,7 @@
         :param list messages: a list of test messages.
         :param str wpt_actual: actual status reported by wpt, may differ from |actual|.
         :param bool subtest_failure: whether this test failed because of subtests.
+        :param Optional[float] duration: time it took in seconds to run this test.
         :param Optional[list] reftest_screenshots: see executors/base.py for definition.
         """
         # The test name can contain a leading / which will produce an empty
@@ -116,6 +123,8 @@
             cur_dict = cur_dict.setdefault(name_part, {})
         cur_dict["actual"] = actual
         cur_dict["expected"] = expected
+        if duration is not None:
+            cur_dict["time"] = duration
         if subtest_failure:
             self._append_artifact(cur_dict, "wpt_subtest_failure", "true")
         if wpt_actual != actual:
@@ -197,14 +206,37 @@
             expected_statuses = " ".join(sorted(all_statsues))
         return expected_statuses
 
+    def _get_time(self, data):
+        """Get the timestamp of a message in seconds since the UNIX epoch."""
+        maybe_timestamp_millis = data.get("time")
+        if maybe_timestamp_millis is not None:
+            return float(maybe_timestamp_millis) / 1000
+        return time.time()
+
+    def _time_test(self, test_name, data):
+        """Time how long a test took to run.
+
+        :param str test_name: the name of the test to time
+        :param data: a data dictionary to extract the test end timestamp from
+        :return Optional[float]: a nonnegative duration in seconds or None if
+                                 the measurement is unavailable or invalid
+        """
+        test_start = self.test_starts.pop(test_name, None)
+        if test_start is not None:
+            # The |data| dictionary only provides millisecond resolution
+            # anyway, so further nonzero digits are unlikely to be meaningful.
+            duration = round(self._get_time(data) - test_start, 3)
+            if duration >= 0:
+                return duration
+        return None
+
     def suite_start(self, data):
-        # |data| contains a timestamp in milliseconds, while time.time() gives
-        # it in seconds.
         if self.start_timestamp_seconds is None:
-            if 'time' in data:
-                self.start_timestamp_seconds = float(data["time"]) / 1000
-            else:
-                self.start_timestamp_seconds = time.time()
+            self.start_timestamp_seconds = self._get_time(data)
+
+    def test_start(self, data):
+        test_name = data["test"]
+        self.test_starts[test_name] = self._get_time(data)
 
     def test_status(self, data):
         test_name = data["test"]
@@ -228,6 +260,7 @@
         wpt_actual_status = data["status"]
         actual_status = self._map_status_name(wpt_actual_status)
         expected_statuses = self._get_expected_status_from_data(actual_status, data)
+        duration = self._time_test(test_name, data)
         subtest_failure = False
         if test_name in self.tests_with_subtest_fails:
             subtest_failure = True
@@ -248,6 +281,7 @@
                                 self.messages[test_name],
                                 wpt_actual_status,
                                 subtest_failure,
+                                duration,
                                 data.get("extra", {}).get("reftest_screenshots"))
 
         # Remove the test from dicts to avoid accumulating too many.
diff --git a/tools/wptrunner/wptrunner/formatters/tests/test_chromium.py b/tools/wptrunner/wptrunner/formatters/tests/test_chromium.py
index 14e0788..77002d3 100644
--- a/tools/wptrunner/wptrunner/formatters/tests/test_chromium.py
+++ b/tools/wptrunner/wptrunner/formatters/tests/test_chromium.py
@@ -60,6 +60,38 @@
     assert "expected" in test_obj
 
 
+def test_time_per_test(logger, capfd):
+    # Test that the formatter measures time per test correctly.
+
+    # Set up the handler.
+    output = StringIO()
+    logger.add_handler(handlers.StreamHandler(output, ChromiumFormatter()))
+
+    # output a bunch of stuff
+    logger.suite_start(["test-id-1", "test-id-2"], run_info={}, time=50)
+    logger.test_start("test-id-1", time=100)
+    logger.test_start("test-id-2", time=200)
+    logger.test_end("test-id-1", status="PASS", expected="PASS", time=300)
+    logger.test_end("test-id-2", status="PASS", expected="PASS", time=199)
+    logger.suite_end()
+    logger.shutdown()
+
+    # check nothing got output to stdout/stderr
+    # (note that mozlog outputs exceptions during handling to stderr!)
+    captured = capfd.readouterr()
+    assert captured.out == ""
+    assert captured.err == ""
+
+    # check the actual output of the formatter
+    output.seek(0)
+    output_obj = json.load(output)
+
+    test1_obj = output_obj["tests"]["test-id-1"]
+    test2_obj = output_obj["tests"]["test-id-2"]
+    assert pytest.approx(test1_obj["time"]) == 0.2  # 300ms - 100ms = 0.2s
+    assert "time" not in test2_obj
+
+
 def test_chromium_test_name_trie(logger, capfd):
     # Ensure test names are broken into directories and stored in a trie with
     # test results at the leaves.