crossbench: Add blink-ai benchmark for built-in on-device AI APIs

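Introduces a new 'blink-ai' benchmark to automate end-to-end performance
and telemetry collection for Chrome's experimental built-in AI APIs.

The built-in AI APIs require a trusted user gesture, so this change also
adds an InputSource.DRIVER click path (Browser.trusted_click, implemented
through WebDriver) that the benchmark uses to press the start button.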

Bug: 517286060

Change-Id: If39d5a862dd621c875b1928f61aa2b9c9f4bc9d6
Reviewed-on: https://chromium-review.googlesource.com/c/crossbench/+/7849981
Reviewed-by: Mikhail Khokhlov <khokhlov@google.com>
Commit-Queue: Devin Cabillo <devincabillo@google.com>
diff --git a/crossbench/action_runner/action/click.py b/crossbench/action_runner/action/click.py
index 487de93..1b62a33 100644
--- a/crossbench/action_runner/action/click.py
+++ b/crossbench/action_runner/action/click.py
@@ -102,7 +102,8 @@
 
   @override
   def supported_input_sources(self) -> tuple[InputSource, ...]:
-    return (InputSource.JS, InputSource.TOUCH, InputSource.MOUSE)
+    return (InputSource.JS, InputSource.TOUCH, InputSource.MOUSE,
+            InputSource.DRIVER)
 
   @override
   def to_json(self) -> JsonDict:
diff --git a/crossbench/action_runner/android_input_action_runner.py b/crossbench/action_runner/android_input_action_runner.py
index f440864..d87740f 100644
--- a/crossbench/action_runner/android_input_action_runner.py
+++ b/crossbench/action_runner/android_input_action_runner.py
@@ -163,6 +163,11 @@
   def click_mouse(self, action: i_action.ClickAction) -> None:
     self._click_impl(action, True)
 
+  def click_driver(self, action: i_action.ClickAction) -> None:
+    # NOTE(review): click_mouse passes True here, so False presumably selects
+    # the non-mouse (touch) input path rather than WebDriver -- confirm.
+    self._click_impl(action, False)
+
   def swipe(self, action: i_action.SwipeAction) -> None:
     with self.actions("SwipeAction", measure=False):
       self._swipe_impl(action.start_x, action.start_y, action.end_x,
diff --git a/crossbench/action_runner/base.py b/crossbench/action_runner/base.py
index e33bbcd..9d7ab6f 100644
--- a/crossbench/action_runner/base.py
+++ b/crossbench/action_runner/base.py
@@ -269,6 +269,8 @@
       do_click = self.click_touch
     elif input_source is InputSource.MOUSE:
       do_click = self.click_mouse
+    elif input_source is InputSource.DRIVER:
+      do_click = self.click_driver
     else:
       raise RuntimeError(f"Unsupported input source: '{input_source}'")
 
@@ -356,6 +358,37 @@
   def click_mouse(self, action: i_action.ClickAction) -> None:
     raise InputSourceNotImplementedError(self, action, action.input_source)
 
+  def click_driver(self, action: i_action.ClickAction) -> None:
+    """Clicks the action's selector through `browser.trusted_click`.
+
+    If the selector config asks for it, waits for the element first;
+    if `action.verify` is set, waits for that selector afterwards.
+
+    Raises:
+      InputSourceNotImplementedError: If the action has a non-zero duration.
+      RuntimeError: If the action's position has no selector.
+    """
+    if action.duration > dt.timedelta():
+      raise InputSourceNotImplementedError(self, action, action.input_source,
+                                           "Non-zero duration not implemented")
+    selector_config = action.position.selector
+    if not selector_config:
+      raise RuntimeError("Missing selector")
+
+    with self.actions("ClickAction (Driver)", measure=False) as actions:
+      if selector_config.wait:
+        self.wait_for_element_impl(
+            actions,
+            selector=selector_config.selector,
+            timeout=action.timeout,
+            required=selector_config.required)
+
+      self.browser.trusted_click(selector_config.selector)
+
+      if action.verify:
+        self.wait_for_element_impl(
+            actions, selector=action.verify, timeout=action.timeout)
+
   def scroll_js(self, action: i_action.ScrollAction) -> None:
     with self.actions("ScrollAction", measure=False) as actions:
       selector = ""
diff --git a/crossbench/benchmarks/all.py b/crossbench/benchmarks/all.py
index e2575d2..c7ae05c 100644
--- a/crossbench/benchmarks/all.py
+++ b/crossbench/benchmarks/all.py
@@ -4,6 +4,7 @@
 
 from __future__ import annotations
 
+from crossbench.benchmarks.blink_ai import BlinkAIBenchmark
 from crossbench.benchmarks.devtools_frontend.devtools_frontend_benchmark import \
     DevToolsFrontendBenchmark
 from crossbench.benchmarks.embedder import EmbedderBenchmark
@@ -35,6 +36,7 @@
 from crossbench.benchmarks.webai import WebAIBenchmark
 
 __all__ = [
+    "BlinkAIBenchmark",
     "DevToolsFrontendBenchmark",
     "EmbedderBenchmark",
     "JetStream11Benchmark",
diff --git a/crossbench/benchmarks/blink_ai/__init__.py b/crossbench/benchmarks/blink_ai/__init__.py
new file mode 100644
index 0000000..112c852
--- /dev/null
+++ b/crossbench/benchmarks/blink_ai/__init__.py
@@ -0,0 +1,7 @@
+# Copyright 2026 The Chromium Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+from __future__ import annotations
+
+from .blink_ai import BlinkAIBenchmark as BlinkAIBenchmark
diff --git a/crossbench/benchmarks/blink_ai/blink_ai.py b/crossbench/benchmarks/blink_ai/blink_ai.py
new file mode 100644
index 0000000..47f737a
--- /dev/null
+++ b/crossbench/benchmarks/blink_ai/blink_ai.py
@@ -0,0 +1,144 @@
+# Copyright 2026 The Chromium Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+from __future__ import annotations
+
+import datetime as dt
+import logging
+from typing import TYPE_CHECKING, ClassVar, Sequence
+
+from typing_extensions import override
+
+from crossbench.action_runner.action.click import ClickAction
+from crossbench.action_runner.action.position import PositionConfig
+from crossbench.benchmarks.base import PressBenchmark, \
+    PressBenchmarkStoryFilter
+from crossbench.benchmarks.loading.input_source import InputSource
+from crossbench.flags.chrome import ChromeFlags
+from crossbench.stories.press_benchmark import PressBenchmarkStory
+
+from .probe import BlinkAIProbe
+
+if TYPE_CHECKING:
+  from crossbench.benchmarks.base import VersionParts
+  from crossbench.browsers.attributes import BrowserAttributes
+  from crossbench.flags.base import Flags
+  from crossbench.runner.run import Run
+
+
+class BlinkAIStory(PressBenchmarkStory):
+  # Drives the hosted blink-ai workload page as a press-benchmark story.
+  NAME: ClassVar[str] = "blink_ai"
+  URL: ClassVar[str] = "https://chromium-workloads.web.app/blink-ai/main/"
+  URL_OFFICIAL: ClassVar[str] = (
+      "https://chromium-workloads.web.app/blink-ai/main/")
+  URL_LOCAL: ClassVar[str] = "http://localhost:8000/"
+  SUBSTORIES: ClassVar[tuple[str, ...]] = ("language_model",)
+
+  @classmethod
+  @override
+  def default_story_names(cls) -> tuple[str, ...]:
+    return ("language_model",)
+
+  def __init__(self,
+               substories: Sequence[str] = (),
+               url: str | None = None) -> None:
+    super().__init__(
+        substories=substories or self.SUBSTORIES, url=url or self.URL)
+
+  @property
+  @override
+  def substory_duration(self) -> dt.timedelta:
+    return dt.timedelta(minutes=5)
+
+  @property
+  @override
+  def slow_duration(self) -> dt.timedelta:
+    return dt.timedelta(hours=2)
+
+  @override
+  def setup(self, run: Run) -> None:
+    """Opens the story URL and waits for `window.LanguageModel` to exist."""
+    api_timeout = dt.timedelta(minutes=10)
+    with run.actions("Setup") as actions:
+      actions.show_url(self.url)
+      logging.info("Waiting for window.LanguageModel to become available...")
+      actions.wait_js_condition(
+          "return !!window.LanguageModel", 2.0, timeout=api_timeout)
+
+  @override
+  def run(self, run: Run) -> None:
+    """Clicks `#start-button`, then waits for the page's test to finish.
+
+    Raises ValueError if the page then reports a 'failed' test status.
+    """
+    with run.actions("Running benchmark") as actions:
+      logging.info("Clicking #start-button to initiate AI E2E test...")
+      # Chrome's Built-in AI API strictly requires a trusted user gesture
+      # to download and compile on-device models.
+      is_chromium = run.browser.attributes().is_chromium_based
+      input_source = InputSource.DRIVER if is_chromium else InputSource.JS
+      start_button = PositionConfig.parse_str("#start-button")
+      start_click = ClickAction(input_source, start_button)
+      run.action_runner.click(start_click)
+      actions.wait_js_condition(
+          "return window.testStatus !== 'running' && "
+          "window.testStatus !== 'waiting';",
+          0.5,
+          timeout=self.slow_duration)
+      if actions.js("return window.testStatus;") == "failed":
+        raise ValueError("Blink-AI Benchmark failed")
+
+
+class BlinkAIBenchmark(PressBenchmark):
+  """
+  Benchmark runner for Chrome Built-in on-device AI APIs.
+  """
+  NAME: ClassVar[str] = "blink-ai"
+  DEFAULT_STORY_CLS = BlinkAIStory
+  PROBES: ClassVar[tuple[type[BlinkAIProbe], ...]] = (BlinkAIProbe,)
+  STORY_FILTER_CLS: ClassVar = PressBenchmarkStoryFilter
+
+  @classmethod
+  @override
+  def short_base_name(cls) -> str:
+    return "blink-ai"
+
+  @classmethod
+  @override
+  def base_name(cls) -> str:
+    return "blink-ai"
+
+  @classmethod
+  @override
+  def version(cls) -> VersionParts:
+    return ("main",)
+
+  @classmethod
+  @override
+  def extra_flags(cls, browser_attributes: BrowserAttributes) -> Flags:
+    base_flags: Flags = super().extra_flags(browser_attributes)
+    if not browser_attributes.is_chromium_based:
+      return base_flags  # Only Chromium-based browsers get the AI flags.
+    ai_features = (
+        "EnableBlinkReceiverAI",
+        "LanguageModelAPI",
+        "AIPromptAPI",
+        "OnDeviceModelLitertLmBackend",
+        "OptimizationGuideOnDeviceModelMultimodal",
+        "OnDeviceModelPerformanceParams:"
+        "compatible_on_device_performance_classes/*",
+        "AIWriterAPI",
+        "AIRewriterAPI",
+        "AIPromptAPIMultimodalInput",
+        "AIPromptAPIMultimodalMultilingual",
+    )
+    chrome_flags = ChromeFlags(base_flags)
+    logging.info("Injecting experimental built-in AI flags for Chrome...")
+    for feature in ai_features:
+      chrome_flags.features.enable(feature)
+    chrome_flags.blink_features.enable("AIResponseStreaming")
+    # Force device evaluation override to run without download gate block.
+    chrome_flags.set("--optimization-guide-force-device-evaluation-override")
+    return chrome_flags
diff --git a/crossbench/benchmarks/blink_ai/probe.py b/crossbench/benchmarks/blink_ai/probe.py
new file mode 100644
index 0000000..5f1b3d7
--- /dev/null
+++ b/crossbench/benchmarks/blink_ai/probe.py
@@ -0,0 +1,69 @@
+# Copyright 2026 The Chromium Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, ClassVar, cast
+
+from typing_extensions import override
+
+from crossbench.benchmarks.benchmark_probe import BenchmarkProbeMixin
+from crossbench.probes.json import JsonResultProbe, JsonResultProbeContext
+from crossbench.probes.metric import MetricsMerger
+
+if TYPE_CHECKING:
+  from crossbench.browsers.browser import Browser
+  from crossbench.probes.results import ProbeResult
+  from crossbench.runner.actions import Actions
+  from crossbench.runner.groups.browsers import BrowsersRunGroup
+  from crossbench.runner.groups.stories import StoriesRunGroup
+  from crossbench.runner.run import Run
+  from crossbench.types import Json
+
+
+class BlinkAIProbe(BenchmarkProbeMixin, JsonResultProbe):
+  """
+  Custom probe for Blink AI benchmark.
+  Extracts window.metrics from the browser tab.
+  """
+  NAME: ClassVar[str] = "blink_ai"
+
+  @override
+  def attach(self, browser: Browser) -> None:
+    super().attach(browser)
+    # Drop these two switches from the browser's flags (None default on pop).
+    browser.flags.pop("--disable-component-update", None)
+    browser.flags.pop(
+        "--disable-optimization-guide-model-downloads-for-benchmarking", None)
+
+  @override
+  def create_context(self, run: Run) -> BlinkAIProbeContext:
+    return cast(BlinkAIProbeContext, super().create_context(run))
+
+  @override
+  def get_context_cls(self) -> type[BlinkAIProbeContext]:
+    return BlinkAIProbeContext
+
+  @override
+  def merge_stories(self, group: StoriesRunGroup) -> ProbeResult:
+    per_repetition_json = (
+        rep_group.results[self].json for rep_group in group.repetitions_groups)
+    merged = MetricsMerger.merge_json_list(per_repetition_json)
+    return self.write_group_result(group, merged)
+
+  @override
+  def merge_browsers(self, group: BrowsersRunGroup) -> ProbeResult:
+    json_result = self.merge_browsers_json_list(group)
+    return json_result.merge(self.merge_browsers_csv_list(group))
+
+
+class BlinkAIProbeContext(JsonResultProbeContext[BlinkAIProbe]):
+  # Page-side snippet that serializes window.metrics (or {} when unset).
+  JS: ClassVar[str] = "return JSON.stringify(window.metrics || {});"
+
+  @override
+  def to_json(self, actions: Actions) -> Json:
+    serialized = actions.js(self.JS)
+    return json.loads(serialized) if serialized else {}
diff --git a/crossbench/benchmarks/loading/input_source.py b/crossbench/benchmarks/loading/input_source.py
index 650a7f5..a382ead 100644
--- a/crossbench/benchmarks/loading/input_source.py
+++ b/crossbench/benchmarks/loading/input_source.py
@@ -15,3 +15,7 @@
   TOUCH = ("touch", "Use the touchscreen to perform the action")
   MOUSE = ("mouse", "Use the mouse to perform the action")
   KEYBOARD = ("keyboard", "Use the keyboard to perform the action")
+  DRIVER = (
+      "driver",
+      "Use webdriver to perform the action (e.g. trusted click)",
+  )
diff --git a/crossbench/browsers/browser.py b/crossbench/browsers/browser.py
index fb29b27..94c25e1 100644
--- a/crossbench/browsers/browser.py
+++ b/crossbench/browsers/browser.py
@@ -512,6 +512,13 @@
   def switch_to_new_tab(self) -> None:
     raise NotImplementedError(f"New tab is not supported by {self}")
 
+  def trusted_click(self, selector: str) -> None:
+    """Clicks the element matching `selector` as a trusted user gesture.
+
+    This base implementation always raises NotImplementedError.
+    """
+    raise NotImplementedError(f"Trusted click is not supported by {self}")
+
   def screenshot(self, path: pth.LocalPath) -> None:
     # TODO: implement screenshot on browser and platform.
     raise NotImplementedError(f"Taking screenshots is not supported by {self}")
diff --git a/crossbench/browsers/webdriver.py b/crossbench/browsers/webdriver.py
index 72df942..0364aac 100644
--- a/crossbench/browsers/webdriver.py
+++ b/crossbench/browsers/webdriver.py
@@ -16,6 +16,7 @@
 import selenium.common.exceptions
 import urllib3
 from selenium import webdriver
+from selenium.webdriver.common.by import By
 from typing_extensions import override
 
 from crossbench import exception
@@ -283,6 +284,25 @@
   def switch_to_new_tab(self) -> None:
     self._private_driver.switch_to.new_window("tab")
 
+  @override
+  def trusted_click(self, selector: str) -> None:
+    """Clicks the first element matching a CSS selector through WebDriver.
+
+    Unlike a click synthesized from page JavaScript, this is meant to produce
+    a native click that the browser treats as a trusted user gesture.
+
+    Raises:
+      DriverException: If WebDriver fails to find or click the element.
+    """
+    logging.debug("WebDriverBrowser.trusted_click(%s)", selector)
+    assert self._is_running
+    try:
+      element = self._private_driver.find_element(By.CSS_SELECTOR, selector)
+      element.click()
+    except selenium.common.exceptions.WebDriverException as e:
+      raise DriverException(f"Failed to click element '{selector}': {e.msg}",
+                            self) from e
+
   @override
   def screenshot(self, path: LocalPath) -> None:
     if not self._private_driver.get_screenshot_as_file(path.as_posix()):
diff --git a/crossbench/cli/cli.py b/crossbench/cli/cli.py
index cb2b9c4..d1759a2 100644
--- a/crossbench/cli/cli.py
+++ b/crossbench/cli/cli.py
@@ -135,6 +135,7 @@
 
 class CrossBenchCLI:
   BENCHMARKS: tuple[BenchmarkClass, ...] = (
+      benchmarks.BlinkAIBenchmark,
       benchmarks.BrowserStartupBenchmark,
       benchmarks.DevToolsFrontendBenchmark,
       benchmarks.EmbedderBenchmark,
diff --git a/tests/crossbench/benchmarks/blink_ai/__init__.py b/tests/crossbench/benchmarks/blink_ai/__init__.py
new file mode 100644
index 0000000..5b5036e
--- /dev/null
+++ b/tests/crossbench/benchmarks/blink_ai/__init__.py
@@ -0,0 +1,3 @@
+# Copyright 2026 The Chromium Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
diff --git a/tests/crossbench/benchmarks/blink_ai/test_blink_ai.py b/tests/crossbench/benchmarks/blink_ai/test_blink_ai.py
new file mode 100644
index 0000000..b95bab7
--- /dev/null
+++ b/tests/crossbench/benchmarks/blink_ai/test_blink_ai.py
@@ -0,0 +1,190 @@
+# Copyright 2026 The Chromium Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+from __future__ import annotations
+
+import copy
+import json
+from typing import TYPE_CHECKING
+from unittest import mock
+
+from typing_extensions import override
+
+from crossbench.benchmarks.blink_ai.blink_ai import BlinkAIBenchmark, \
+    BlinkAIStory
+from crossbench.benchmarks.blink_ai.probe import BlinkAIProbe, \
+    BlinkAIProbeContext
+from crossbench.env.runner_env import EnvConfig, ValidationMode
+from crossbench.runner.runner import Runner
+from tests import test_helper
+from tests.crossbench.benchmarks import helper
+
+if TYPE_CHECKING:
+  from tests.crossbench.mock_browser import MockBrowser
+
+
+class BlinkAITestCase(helper.SubStoryTestCase):
+
+  @property
+  @override
+  def benchmark_cls(self):
+    return BlinkAIBenchmark
+
+  @property
+  @override
+  def story_cls(self):
+    return BlinkAIStory
+
+  @property
+  def probe_cls(self):
+    return BlinkAIProbe
+
+  @property
+  def probe_context_cls(self):
+    return BlinkAIProbeContext
+
+  def _setup_run_js_expect(self,
+                           browser: MockBrowser,
+                           probe_results: dict,
+                           status: str = "success") -> None:
+    # wait_js_condition for window.LanguageModel
+    browser.expect_js(result=True)
+    # #start-button click: JS input source, or MockBrowser.trusted_click's js()
+    browser.expect_js(result=None)
+    # wait_js_condition until testStatus is neither 'running' nor 'waiting'
+    browser.expect_js(result=True)
+    # window.testStatus check
+    browser.expect_js(result=status)
+    # window.metrics in probe
+    browser.expect_js(result=json.dumps(probe_results))
+
+  def test_run_default(self):
+    # Prepare stories
+    stories = self.story_cls.from_names(["language_model"])
+    benchmark = self.benchmark_cls(stories)
+    self.assertTrue(len(benchmark.describe()) > 0)
+
+    # Set up expectations for mock browsers
+    probe_results = {
+        "downloadTimeMs": 500.5,
+        "sessionCreationTimeMs": 120.5,
+        "coldTimeToFirstTokenMs": 45.2,
+        "coldTotalPromptTimeMs": 250.0,
+        "coldChunksPerSecond": 45.8,
+        "warmTimeToFirstTokenMs": [12.5, 10.2, 9.8],
+        "warmTotalPromptTimeMs": [110.0, 95.0, 92.0],
+        "warmChunksPerSecond": [50.2, 55.1, 56.3]
+    }
+
+    repetitions = 2
+    for _ in range(repetitions):
+      for browser in self.browsers:
+        self._setup_run_js_expect(browser, probe_results)
+
+    for browser in self.browsers:
+      browser.expected_js = copy.deepcopy(browser.expected_js)
+
+    runner = Runner(
+        self.out_dir,
+        self.browsers,
+        benchmark,
+        env_config=EnvConfig(),
+        env_validation_mode=ValidationMode.SKIP,
+        platform=self.platform,
+        repetitions=repetitions,
+        throw=True,
+        in_memory_result_db=True)
+
+    with mock.patch.object(self.benchmark_cls, "validate_url") as cm:
+      runner.run()
+    cm.assert_called_once()
+
+    # Verification
+    for browser in self.browsers:
+      urls = self.filter_splashscreen_urls(browser.url_list)
+      self.assertEqual(len(urls), repetitions)
+      self.assertIn(self.story_cls.URL, urls)
+      self.assertListEqual(browser.expected_js, [])
+
+  def test_run_custom_url(self):
+    custom_url = "http://test.example.com/blink_ai"
+    stories = self.story_cls.from_names(["language_model"], url=custom_url)
+    benchmark = self.benchmark_cls(stories)
+
+    probe_results = {
+        "downloadTimeMs": 0.0,
+        "sessionCreationTimeMs": 100.0,
+        "coldTimeToFirstTokenMs": 40.0,
+        "coldTotalPromptTimeMs": 200.0,
+        "coldChunksPerSecond": 50.0,
+        "warmTimeToFirstTokenMs": [10.0],
+        "warmTotalPromptTimeMs": [90.0],
+        "warmChunksPerSecond": [55.0]
+    }
+    repetitions = 1
+    for _ in range(repetitions):
+      for browser in self.browsers:
+        self._setup_run_js_expect(browser, probe_results)
+
+    for browser in self.browsers:
+      browser.expected_js = copy.deepcopy(browser.expected_js)
+
+    runner = Runner(
+        self.out_dir,
+        self.browsers,
+        benchmark,
+        env_config=EnvConfig(),
+        env_validation_mode=ValidationMode.SKIP,
+        platform=self.platform,
+        repetitions=repetitions,
+        throw=True,
+        in_memory_result_db=True)
+
+    with mock.patch.object(self.benchmark_cls, "validate_url") as cm:
+      runner.run()
+    cm.assert_called_once()
+
+    for browser in self.browsers:
+      urls = self.filter_splashscreen_urls(browser.url_list)
+      self.assertEqual(len(urls), repetitions)
+      self.assertIn(custom_url, urls)
+      self.assertListEqual(browser.expected_js, [])
+
+  def test_run_error(self):
+    stories = self.story_cls.from_names(["language_model"])
+    benchmark = self.benchmark_cls(stories)
+
+    probe_results = {}
+    repetitions = 1
+    active_browsers = self.browsers[:1]
+    for _ in range(repetitions):
+      for browser in active_browsers:
+        self._setup_run_js_expect(browser, probe_results, status="failed")
+
+    for browser in active_browsers:
+      browser.expected_js = copy.deepcopy(browser.expected_js)
+
+    runner = Runner(
+        self.out_dir,
+        active_browsers,
+        benchmark,
+        env_config=EnvConfig(),
+        env_validation_mode=ValidationMode.SKIP,
+        platform=self.platform,
+        repetitions=repetitions,
+        throw=True,
+        in_memory_result_db=True)
+
+    with mock.patch.object(self.benchmark_cls, "validate_url") as cm:
+      with self.assertRaises(ValueError) as cm_err:
+        runner.run()
+      self.assertIn("Blink-AI Benchmark failed", str(cm_err.exception))
+    cm.assert_called_once()
+
+    for browser in active_browsers:
+      self.assertListEqual(browser.expected_js, [])
+
+
+if __name__ == "__main__":
+  test_helper.run_pytest(__file__)
diff --git a/tests/crossbench/mock_browser.py b/tests/crossbench/mock_browser.py
index 83cea44..172b497 100644
--- a/tests/crossbench/mock_browser.py
+++ b/tests/crossbench/mock_browser.py
@@ -164,6 +164,14 @@
   def show_url(self, url, target: str | None = None) -> None:
     self.url_list.append(url)
 
+  def trusted_click(self, selector: str) -> None:
+    # The mock has no real driver, so the click is routed through js();
+    # presumably this consumes one queued expect_js() entry -- confirm.
+    self.js(
+        "const el = document.querySelector(arguments[0]); "
+        "if (el) el.click();",
+        arguments=[selector])
+
   @override
   def current_window_id(self) -> str:
     return str(self.tab_list[-1])