crossbench: Add blink-ai benchmark for built-in on-device AI APIs

Introduces a new 'blink-ai' benchmark to automate end-to-end performance
and telemetry collection for Chrome's experimental built-in AI APIs.

Bug: 517286060
Change-Id: If39d5a862dd621c875b1928f61aa2b9c9f4bc9d6
Reviewed-on: https://chromium-review.googlesource.com/c/crossbench/+/7849981
Reviewed-by: Mikhail Khokhlov <khokhlov@google.com>
Commit-Queue: Devin Cabillo <devincabillo@google.com>
diff --git a/crossbench/action_runner/action/click.py b/crossbench/action_runner/action/click.py index 487de93..1b62a33 100644 --- a/crossbench/action_runner/action/click.py +++ b/crossbench/action_runner/action/click.py
@@ -102,7 +102,8 @@ @override def supported_input_sources(self) -> tuple[InputSource, ...]: - return (InputSource.JS, InputSource.TOUCH, InputSource.MOUSE) + return (InputSource.JS, InputSource.TOUCH, InputSource.MOUSE, + InputSource.DRIVER) @override def to_json(self) -> JsonDict:
diff --git a/crossbench/action_runner/android_input_action_runner.py b/crossbench/action_runner/android_input_action_runner.py index f440864..d87740f 100644 --- a/crossbench/action_runner/android_input_action_runner.py +++ b/crossbench/action_runner/android_input_action_runner.py
@@ -163,6 +163,9 @@ def click_mouse(self, action: i_action.ClickAction) -> None: self._click_impl(action, True) + def click_driver(self, action: i_action.ClickAction) -> None: + self._click_impl(action, False) + def swipe(self, action: i_action.SwipeAction) -> None: with self.actions("SwipeAction", measure=False): self._swipe_impl(action.start_x, action.start_y, action.end_x,
diff --git a/crossbench/action_runner/base.py b/crossbench/action_runner/base.py index e33bbcd..9d7ab6f 100644 --- a/crossbench/action_runner/base.py +++ b/crossbench/action_runner/base.py
@@ -269,6 +269,8 @@ do_click = self.click_touch elif input_source is InputSource.MOUSE: do_click = self.click_mouse + elif input_source is InputSource.DRIVER: + do_click = self.click_driver else: raise RuntimeError(f"Unsupported input source: '{input_source}'") @@ -356,6 +358,27 @@ def click_mouse(self, action: i_action.ClickAction) -> None: raise InputSourceNotImplementedError(self, action, action.input_source) + def click_driver(self, action: i_action.ClickAction) -> None: + if action.duration > dt.timedelta(): + raise InputSourceNotImplementedError(self, action, action.input_source, + "Non-zero duration not implemented") + selector_config = action.position.selector + if not selector_config: + raise RuntimeError("Missing selector") + + with self.actions("ClickAction (Driver)", measure=False) as actions: + if selector_config.wait: + self.wait_for_element_impl( + actions, + selector=selector_config.selector, + timeout=action.timeout, + required=selector_config.required) + + self.browser.trusted_click(selector_config.selector) + + if action.verify: + self.wait_for_element_impl( + actions, selector=action.verify, timeout=action.timeout) def scroll_js(self, action: i_action.ScrollAction) -> None: with self.actions("ScrollAction", measure=False) as actions: selector = ""
diff --git a/crossbench/benchmarks/all.py b/crossbench/benchmarks/all.py index e2575d2..c7ae05c 100644 --- a/crossbench/benchmarks/all.py +++ b/crossbench/benchmarks/all.py
@@ -4,6 +4,7 @@ from __future__ import annotations +from crossbench.benchmarks.blink_ai import BlinkAIBenchmark from crossbench.benchmarks.devtools_frontend.devtools_frontend_benchmark import \ DevToolsFrontendBenchmark from crossbench.benchmarks.embedder import EmbedderBenchmark @@ -35,6 +36,7 @@ from crossbench.benchmarks.webai import WebAIBenchmark __all__ = [ + "BlinkAIBenchmark", "DevToolsFrontendBenchmark", "EmbedderBenchmark", "JetStream11Benchmark",
diff --git a/crossbench/benchmarks/blink_ai/__init__.py b/crossbench/benchmarks/blink_ai/__init__.py new file mode 100644 index 0000000..112c852 --- /dev/null +++ b/crossbench/benchmarks/blink_ai/__init__.py
@@ -0,0 +1,7 @@ +# Copyright 2026 The Chromium Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +from __future__ import annotations + +from .blink_ai import BlinkAIBenchmark as BlinkAIBenchmark
diff --git a/crossbench/benchmarks/blink_ai/blink_ai.py b/crossbench/benchmarks/blink_ai/blink_ai.py new file mode 100644 index 0000000..47f737a --- /dev/null +++ b/crossbench/benchmarks/blink_ai/blink_ai.py
@@ -0,0 +1,144 @@ +# Copyright 2026 The Chromium Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +from __future__ import annotations + +import datetime as dt +import logging +from typing import TYPE_CHECKING, ClassVar, Sequence + +from typing_extensions import override + +from crossbench.action_runner.action.click import ClickAction +from crossbench.action_runner.action.position import PositionConfig +from crossbench.benchmarks.base import PressBenchmark, \ + PressBenchmarkStoryFilter +from crossbench.benchmarks.loading.input_source import InputSource +from crossbench.flags.chrome import ChromeFlags +from crossbench.stories.press_benchmark import PressBenchmarkStory + +from .probe import BlinkAIProbe + +if TYPE_CHECKING: + from crossbench.benchmarks.base import VersionParts + from crossbench.browsers.attributes import BrowserAttributes + from crossbench.flags.base import Flags + from crossbench.runner.run import Run + + +class BlinkAIStory(PressBenchmarkStory): + NAME: ClassVar[str] = "blink_ai" + URL: ClassVar[str] = "https://chromium-workloads.web.app/blink-ai/main/" + URL_OFFICIAL: ClassVar[str] = ( + "https://chromium-workloads.web.app/blink-ai/main/") + URL_LOCAL: ClassVar[str] = "http://localhost:8000/" + SUBSTORIES: ClassVar[tuple[str, ...]] = ("language_model",) + + @classmethod + @override + def default_story_names(cls) -> tuple[str, ...]: + return ("language_model",) + + def __init__(self, + substories: Sequence[str] = (), + url: str | None = None) -> None: + if not substories: + substories = self.SUBSTORIES + super().__init__(substories=substories, url=url or self.URL) + + @property + @override + def substory_duration(self) -> dt.timedelta: + return dt.timedelta(minutes=5) + + @property + @override + def slow_duration(self) -> dt.timedelta: + return dt.timedelta(hours=2) + + @override + def setup(self, run: Run) -> None: + with run.actions("Setup") as actions: + actions.show_url(self.url) 
+ logging.info("Waiting for window.LanguageModel to become available...") + actions.wait_js_condition( + "return !!window.LanguageModel", + 2.0, + timeout=dt.timedelta(minutes=10)) + + @override + def run(self, run: Run) -> None: + with run.actions("Running benchmark") as actions: + logging.info("Clicking #start-button to initiate AI E2E test...") + # Chrome's Built-in AI API strictly requires a trusted user gesture + # to download and compile on-device models. + if run.browser.attributes().is_chromium_based: + action = ClickAction(InputSource.DRIVER, + PositionConfig.parse_str("#start-button")) + else: + action = ClickAction(InputSource.JS, + PositionConfig.parse_str("#start-button")) + run.action_runner.click(action) + actions.wait_js_condition( + "return window.testStatus !== 'running' && " + "window.testStatus !== 'waiting';", + 0.5, + timeout=self.slow_duration) + + status = actions.js("return window.testStatus;") + if status == "failed": + raise ValueError("Blink-AI Benchmark failed") + + +class BlinkAIBenchmark(PressBenchmark): + """ + Benchmark runner for Chrome Built-in on-device AI APIs. 
+ """ + NAME: ClassVar[str] = "blink-ai" + DEFAULT_STORY_CLS = BlinkAIStory + PROBES: ClassVar[tuple[type[BlinkAIProbe], ...]] = (BlinkAIProbe,) + STORY_FILTER_CLS: ClassVar = PressBenchmarkStoryFilter + + @classmethod + @override + def short_base_name(cls) -> str: + return "blink-ai" + + @classmethod + @override + def base_name(cls) -> str: + return "blink-ai" + + @classmethod + @override + def version(cls) -> VersionParts: + return ("main",) + + @classmethod + @override + def extra_flags(cls, browser_attributes: BrowserAttributes) -> Flags: + flags: Flags = super().extra_flags(browser_attributes) + if not browser_attributes.is_chromium_based: + return flags + + chrome_flags = ChromeFlags(flags) + logging.info("Injecting experimental built-in AI flags for Chrome...") + for feature in ( + "EnableBlinkReceiverAI", + "LanguageModelAPI", + "AIPromptAPI", + "OnDeviceModelLitertLmBackend", + "OptimizationGuideOnDeviceModelMultimodal", + "OnDeviceModelPerformanceParams:" + "compatible_on_device_performance_classes/*", + "AIWriterAPI", + "AIRewriterAPI", + "AIPromptAPIMultimodalInput", + "AIPromptAPIMultimodalMultilingual", + ): + chrome_flags.features.enable(feature) + chrome_flags.blink_features.enable("AIResponseStreaming") + # Force device evaluation override to run without download gate block. + chrome_flags.set("--optimization-guide-force-device-evaluation-override") + return chrome_flags
diff --git a/crossbench/benchmarks/blink_ai/probe.py b/crossbench/benchmarks/blink_ai/probe.py new file mode 100644 index 0000000..5f1b3d7 --- /dev/null +++ b/crossbench/benchmarks/blink_ai/probe.py
@@ -0,0 +1,69 @@ +# Copyright 2026 The Chromium Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, ClassVar, cast + +from typing_extensions import override + +from crossbench.benchmarks.benchmark_probe import BenchmarkProbeMixin +from crossbench.probes.json import JsonResultProbe, JsonResultProbeContext +from crossbench.probes.metric import MetricsMerger + +if TYPE_CHECKING: + from crossbench.browsers.browser import Browser + from crossbench.probes.results import ProbeResult + from crossbench.runner.actions import Actions + from crossbench.runner.groups.browsers import BrowsersRunGroup + from crossbench.runner.groups.stories import StoriesRunGroup + from crossbench.runner.run import Run + from crossbench.types import Json + + +class BlinkAIProbe(BenchmarkProbeMixin, JsonResultProbe): + """ + Custom probe for Blink AI benchmark. + Extracts window.metrics from the browser tab. 
+ """ + NAME: ClassVar[str] = "blink_ai" + + @override + def attach(self, browser: Browser) -> None: + super().attach(browser) + for flag in ( + "--disable-component-update", + "--disable-optimization-guide-model-downloads-for-benchmarking"): + browser.flags.pop(flag, None) + + @override + def create_context(self, run: Run) -> BlinkAIProbeContext: + return cast(BlinkAIProbeContext, super().create_context(run)) + + @override + def get_context_cls(self) -> type[BlinkAIProbeContext]: + return BlinkAIProbeContext + + @override + def merge_stories(self, group: StoriesRunGroup) -> ProbeResult: + merged = MetricsMerger.merge_json_list( + repetitions_group.results[self].json + for repetitions_group in group.repetitions_groups) + return self.write_group_result(group, merged) + + @override + def merge_browsers(self, group: BrowsersRunGroup) -> ProbeResult: + return self.merge_browsers_json_list(group).merge( + self.merge_browsers_csv_list(group)) + + +class BlinkAIProbeContext(JsonResultProbeContext[BlinkAIProbe]): + JS: ClassVar[str] = "return JSON.stringify(window.metrics || {});" + + @override + def to_json(self, actions: Actions) -> Json: + if json_payload := actions.js(self.JS): + return json.loads(json_payload) + return {}
diff --git a/crossbench/benchmarks/loading/input_source.py b/crossbench/benchmarks/loading/input_source.py index 650a7f5..a382ead 100644 --- a/crossbench/benchmarks/loading/input_source.py +++ b/crossbench/benchmarks/loading/input_source.py
@@ -15,3 +15,7 @@ TOUCH = ("touch", "Use the touchscreen to perform the action") MOUSE = ("mouse", "Use the mouse to perform the action") KEYBOARD = ("keyboard", "Use the keyboard to perform the action") + DRIVER = ( + "driver", + "Use webdriver to perform the action (e.g. trusted click)", + )
diff --git a/crossbench/browsers/browser.py b/crossbench/browsers/browser.py index fb29b27..94c25e1 100644 --- a/crossbench/browsers/browser.py +++ b/crossbench/browsers/browser.py
@@ -512,6 +512,9 @@ def switch_to_new_tab(self) -> None: raise NotImplementedError(f"New tab is not supported by {self}") + def trusted_click(self, selector: str) -> None: + raise NotImplementedError(f"Trusted click is not supported by {self}") + def screenshot(self, path: pth.LocalPath) -> None: # TODO: implement screenshot on browser and platform. raise NotImplementedError(f"Taking screenshots is not supported by {self}")
diff --git a/crossbench/browsers/webdriver.py b/crossbench/browsers/webdriver.py index 72df942..0364aac 100644 --- a/crossbench/browsers/webdriver.py +++ b/crossbench/browsers/webdriver.py
@@ -16,6 +16,7 @@ import selenium.common.exceptions import urllib3 from selenium import webdriver +from selenium.webdriver.common.by import By from typing_extensions import override from crossbench import exception @@ -283,6 +284,17 @@ def switch_to_new_tab(self) -> None: self._private_driver.switch_to.new_window("tab") + def trusted_click(self, selector: str) -> None: + # Triggers a native click, generating a trusted user gesture in the browser. + logging.debug("WebDriverBrowser.trusted_click(%s)", selector) + assert self._is_running + try: + element = self._private_driver.find_element(By.CSS_SELECTOR, selector) + element.click() + except selenium.common.exceptions.WebDriverException as e: + raise DriverException(f"Failed to click element '{selector}': {e.msg}", + self) from e + @override def screenshot(self, path: LocalPath) -> None: if not self._private_driver.get_screenshot_as_file(path.as_posix()):
diff --git a/crossbench/cli/cli.py b/crossbench/cli/cli.py index cb2b9c4..d1759a2 100644 --- a/crossbench/cli/cli.py +++ b/crossbench/cli/cli.py
@@ -135,6 +135,7 @@ class CrossBenchCLI: BENCHMARKS: tuple[BenchmarkClass, ...] = ( + benchmarks.BlinkAIBenchmark, benchmarks.BrowserStartupBenchmark, benchmarks.DevToolsFrontendBenchmark, benchmarks.EmbedderBenchmark,
diff --git a/tests/crossbench/benchmarks/blink_ai/__init__.py b/tests/crossbench/benchmarks/blink_ai/__init__.py new file mode 100644 index 0000000..5b5036e --- /dev/null +++ b/tests/crossbench/benchmarks/blink_ai/__init__.py
@@ -0,0 +1,3 @@ +# Copyright 2026 The Chromium Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file.
diff --git a/tests/crossbench/benchmarks/blink_ai/test_blink_ai.py b/tests/crossbench/benchmarks/blink_ai/test_blink_ai.py new file mode 100644 index 0000000..b95bab7 --- /dev/null +++ b/tests/crossbench/benchmarks/blink_ai/test_blink_ai.py
@@ -0,0 +1,190 @@ +# Copyright 2026 The Chromium Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +from __future__ import annotations + +import copy +import json +from typing import TYPE_CHECKING +from unittest import mock + +from typing_extensions import override + +from crossbench.benchmarks.blink_ai.blink_ai import BlinkAIBenchmark, \ + BlinkAIStory +from crossbench.benchmarks.blink_ai.probe import BlinkAIProbe, \ + BlinkAIProbeContext +from crossbench.env.runner_env import EnvConfig, ValidationMode +from crossbench.runner.runner import Runner +from tests import test_helper +from tests.crossbench.benchmarks import helper + +if TYPE_CHECKING: + from tests.crossbench.mock_browser import MockBrowser + + +class BlinkAITestCase(helper.SubStoryTestCase): + + @property + @override + def benchmark_cls(self): + return BlinkAIBenchmark + + @property + @override + def story_cls(self): + return BlinkAIStory + + @property + def probe_cls(self): + return BlinkAIProbe + + @property + def probe_context_cls(self): + return BlinkAIProbeContext + + def _setup_run_js_expect(self, + browser: MockBrowser, + probe_results: dict, + status: str = "success") -> None: + # wait_js_condition for window.LanguageModel + browser.expect_js(result=True) + # JS click for #start-button + browser.expect_js(result=None) + # wait_js_condition for window.testStatus !== 'running' + browser.expect_js(result=True) + # window.testStatus check + browser.expect_js(result=status) + # window.metrics in probe + browser.expect_js(result=json.dumps(probe_results)) + + def test_run_default(self): + # Prepare stories + stories = self.story_cls.from_names(["language_model"]) + benchmark = self.benchmark_cls(stories) + self.assertTrue(len(benchmark.describe()) > 0) + + # Set up expectations for mock browsers + probe_results = { + "downloadTimeMs": 500.5, + "sessionCreationTimeMs": 120.5, + "coldTimeToFirstTokenMs": 45.2, + "coldTotalPromptTimeMs": 
250.0, + "coldChunksPerSecond": 45.8, + "warmTimeToFirstTokenMs": [12.5, 10.2, 9.8], + "warmTotalPromptTimeMs": [110.0, 95.0, 92.0], + "warmChunksPerSecond": [50.2, 55.1, 56.3] + } + + repetitions = 2 + for _ in range(repetitions): + for browser in self.browsers: + self._setup_run_js_expect(browser, probe_results) + + for browser in self.browsers: + browser.expected_js = copy.deepcopy(browser.expected_js) + + runner = Runner( + self.out_dir, + self.browsers, + benchmark, + env_config=EnvConfig(), + env_validation_mode=ValidationMode.SKIP, + platform=self.platform, + repetitions=repetitions, + throw=True, + in_memory_result_db=True) + + with mock.patch.object(self.benchmark_cls, "validate_url") as cm: + runner.run() + cm.assert_called_once() + + # Verification + for browser in self.browsers: + urls = self.filter_splashscreen_urls(browser.url_list) + self.assertEqual(len(urls), repetitions) + self.assertIn(self.story_cls.URL, urls) + self.assertListEqual(browser.expected_js, []) + + def test_run_custom_url(self): + custom_url = "http://test.example.com/blink_ai" + stories = self.story_cls.from_names(["language_model"], url=custom_url) + benchmark = self.benchmark_cls(stories) + + probe_results = { + "downloadTimeMs": 0.0, + "sessionCreationTimeMs": 100.0, + "coldTimeToFirstTokenMs": 40.0, + "coldTotalPromptTimeMs": 200.0, + "coldChunksPerSecond": 50.0, + "warmTimeToFirstTokenMs": [10.0], + "warmTotalPromptTimeMs": [90.0], + "warmChunksPerSecond": [55.0] + } + repetitions = 1 + for _ in range(repetitions): + for browser in self.browsers: + self._setup_run_js_expect(browser, probe_results) + + for browser in self.browsers: + browser.expected_js = copy.deepcopy(browser.expected_js) + + runner = Runner( + self.out_dir, + self.browsers, + benchmark, + env_config=EnvConfig(), + env_validation_mode=ValidationMode.SKIP, + platform=self.platform, + repetitions=repetitions, + throw=True, + in_memory_result_db=True) + + with mock.patch.object(self.benchmark_cls, "validate_url") 
as cm: + runner.run() + cm.assert_called_once() + + for browser in self.browsers: + urls = self.filter_splashscreen_urls(browser.url_list) + self.assertEqual(len(urls), repetitions) + self.assertIn(custom_url, urls) + self.assertListEqual(browser.expected_js, []) + + def test_run_error(self): + stories = self.story_cls.from_names(["language_model"]) + benchmark = self.benchmark_cls(stories) + + probe_results = {} + repetitions = 1 + active_browsers = self.browsers[:1] + for _ in range(repetitions): + for browser in active_browsers: + self._setup_run_js_expect(browser, probe_results, status="failed") + + for browser in active_browsers: + browser.expected_js = copy.deepcopy(browser.expected_js) + + runner = Runner( + self.out_dir, + active_browsers, + benchmark, + env_config=EnvConfig(), + env_validation_mode=ValidationMode.SKIP, + platform=self.platform, + repetitions=repetitions, + throw=True, + in_memory_result_db=True) + + with mock.patch.object(self.benchmark_cls, "validate_url") as cm: + with self.assertRaises(ValueError) as cm_err: + runner.run() + self.assertIn("Blink-AI Benchmark failed", str(cm_err.exception)) + cm.assert_called_once() + + for browser in active_browsers: + self.assertListEqual(browser.expected_js, []) + + +if __name__ == "__main__": + test_helper.run_pytest(__file__)
diff --git a/tests/crossbench/mock_browser.py b/tests/crossbench/mock_browser.py index 83cea44..172b497 100644 --- a/tests/crossbench/mock_browser.py +++ b/tests/crossbench/mock_browser.py
@@ -164,6 +164,12 @@ def show_url(self, url, target: str | None = None) -> None: self.url_list.append(url) + def trusted_click(self, selector: str) -> None: + self.js( + "const el = document.querySelector(arguments[0]); " + "if (el) el.click();", + arguments=[selector]) + @override def current_window_id(self) -> str: return str(self.tab_list[-1])