Add subsuite support to wptrunner (#39711)

* Add support for subsuites in wptrunner

Subsuites are alternate test configurations that can run as part of a
single test run (e.g. so that different configurations can be run in
parallel).

Subsuites are defined by passing the path to a JSON file as the
`--subsuite-file` command line argument. The file has the following
structure:

{
  "<subsuite_name>": {
     "config": <object>,
     "run_info": <object>,
     "include": [<string>],
     "tags": [<string>]
  }
}

All the per-subsuite fields are optional.

"config" defines the configuration of the subsuite. It is interpreted
by individual products when constructing the browser_kwargs and
executor_kwargs for test groups in the subsuite.

"run_info" provides additional run_info properties for the subsuite,
which can be used in metadata files. The subsuite name is
automatically added to the run_info under the `subsuite` key.

"include" is a list of test URL prefixes to include in the subsuite.

"tags" is a list of metadata tags for tests to include in the
subsuite.
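
For illustration, a subsuite file defining a single extra
configuration might look like the following (the subsuite name,
run_info property, paths and tag are all hypothetical):

{
  "my-feature": {
    "run_info": {"my_feature": true},
    "include": ["/dom/", "/html/semantics/"],
    "tags": ["my-feature"]
  }
}

By default, all subsuites defined in the file run in addition to the
default (unnamed) configuration.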

Conceptually, the implementation replaces all the objects that were
previously keyed on test_type with ones keyed on (subsuite,
test_type). This means we get different Browser and Executor instances
for each subsuite, so we have to restart the browser between subsuites
even when that might theoretically be unnecessary (e.g. when a
subsuite only sets prefs that could be changed at runtime). However,
since this meets all of the known requirements and is simpler than a
design that allows multiple subsuites in the same browser/executor
instance, it seems like a reasonable tradeoff.
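
As a minimal sketch of the resulting data structure (simplified; the
names below are illustrative stand-ins rather than the actual
wptrunner code):

from typing import Any, Dict, NamedTuple, Tuple

class TestImplementation(NamedTuple):
    # Simplified stand-in: in wptrunner this also carries the Browser
    # and Executor classes for the configuration.
    browser_kwargs: Dict[str, Any]
    executor_kwargs: Dict[str, Any]

# Hypothetical subsuite configs and test types.
subsuites = {"": {}, "my-feature": {"prefs": ["dom.my_feature.enabled:true"]}}
test_types = ["testharness", "reftest"]

# Implementations are keyed on (subsuite, test_type) rather than
# test_type alone, so each subsuite gets its own Browser/Executor
# setup and switching subsuite implies a browser restart.
test_implementations: Dict[Tuple[str, str], TestImplementation] = {}
for subsuite_name, config in subsuites.items():
    for test_type in test_types:
        test_implementations[(subsuite_name, test_type)] = TestImplementation(
            browser_kwargs={"binary_args": config.get("binary_args", []),
                            "extra_prefs": config.get("prefs", [])},
            executor_kwargs={"subsuite": subsuite_name})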

Differential Revision: https://phabricator.services.mozilla.com/D172000

* Add support for subsuites to firefox

This allows subsuites to set additional prefs or command line
arguments for the browser.
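
For example (the pref and argument below are purely illustrative), a
subsuite entry such as

{
  "my-feature": {
    "config": {
      "prefs": ["dom.my_feature.enabled:true"],
      "binary_args": ["--headless"]
    }
  }
}

appends those prefs and arguments to the ones supplied on the
wptrunner command line.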

Differential Revision: https://phabricator.services.mozilla.com/D172001

* Add subsuite support to wptreport formatter

This adds a top-level `subsuites` key listing all subsuites defined in
the current run, along with their run_info updates, and gives each
test result a `subsuite` key naming the subsuite it belongs to.
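
An abbreviated example of the resulting report (values are
illustrative):

{
  "run_info": {"product": "firefox", "os": "linux"},
  "time_start": 1700000000000,
  "subsuites": {
    "my-feature": {"my_feature": true}
  },
  "results": [
    {"test": "/dom/example.html",
     "subsuite": "",
     "status": "OK",
     "message": null,
     "subtests": []},
    {"test": "/dom/example.html",
     "subsuite": "my-feature",
     "status": "OK",
     "message": null,
     "subtests": []}
  ],
  "time_end": 1700000000100
}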

Differential Revision: https://phabricator.services.mozilla.com/D172002

* Add support for subsuites in wpt-update

This just requires looking up the correct run_info per subsuite,
rather than assuming a single run_info for the entire test run.
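
A minimal sketch of that lookup (simplified from the metadata updater
changes below; names are illustrative):

from typing import Any, Dict

# Hypothetical base run_info for the whole run.
base_run_info = {"product": "firefox", "os": "linux", "debug": False}

# run_info is looked up per subsuite; the default (unnamed) subsuite
# just uses the base run_info.
run_info_by_subsuite: Dict[str, Dict[str, Any]] = {"": base_run_info}

def add_subsuite(name: str, run_info_extras: Dict[str, Any]) -> None:
    # Each subsuite's run_info is the base run_info plus its own extras.
    run_info_by_subsuite[name] = {**base_run_info, **run_info_extras}

add_subsuite("my-feature", {"my_feature": True, "subsuite": "my-feature"})

def run_info_for_result(result: Dict[str, Any]) -> Dict[str, Any]:
    # Results without a subsuite key belong to the default subsuite.
    return run_info_by_subsuite[result.get("subsuite", "")]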

Differential Revision: https://phabricator.services.mozilla.com/D172003

Co-authored-by: Weizhong Xia <weizhong@google.com>
Co-authored-by: Weizhong Xia <77710146+WeizhongX@users.noreply.github.com>
Co-authored-by: Jonathan Lee <jonathan-j-lee@users.noreply.github.com>
diff --git a/tools/wptrunner/wptrunner/browsers/content_shell.py b/tools/wptrunner/wptrunner/browsers/content_shell.py
index 45fb48a..82391af 100644
--- a/tools/wptrunner/wptrunner/browsers/content_shell.py
+++ b/tools/wptrunner/wptrunner/browsers/content_shell.py
@@ -42,7 +42,7 @@
     pass
 
 
-def browser_kwargs(logger, test_type, run_info_data, config, **kwargs):
+def browser_kwargs(logger, test_type, run_info_data, config, subsuite, **kwargs):
     args = []
     args.append("--ignore-certificate-errors-spki-list=%s" %
         ','.join(chrome_spki_certs.IGNORE_CERTIFICATE_ERRORS_SPKI_LIST))
@@ -58,6 +58,9 @@
     for arg in kwargs.get("binary_args", []):
         if arg not in args:
             args.append(arg)
+    for arg in subsuite.config.get("binary_args", []):
+        if arg not in args:
+            args.append(arg)
     args.append("-")
 
     return {"binary": kwargs["binary"],
@@ -126,13 +129,17 @@
         self._stdin_queue = Queue()
         self._io_stopped = Event()
 
-        self._stdout_reader = self._create_reader_thread(self._proc.stdout,
+        self._stdout_reader = self._create_reader_thread("stdout-reader",
+                                                         self._proc.stdout,
                                                          self._stdout_queue,
                                                          prefix=b"OUT: ")
-        self._stderr_reader = self._create_reader_thread(self._proc.stderr,
+        self._stderr_reader = self._create_reader_thread("stderr-reader",
+                                                         self._proc.stderr,
                                                          self._stderr_queue,
                                                          prefix=b"ERR: ")
-        self._stdin_writer = self._create_writer_thread(self._proc.stdin, self._stdin_queue)
+        self._stdin_writer = self._create_writer_thread("stdin-writer",
+                                                        self._proc.stdin,
+                                                        self._stdin_queue)
 
         # Content shell is likely still in the process of initializing. The actual waiting
         # for the startup to finish is done in the ContentShellProtocol.
@@ -164,7 +171,7 @@
 
         for thread in [self._stdout_reader, self._stderr_reader, self._stdin_writer]:
             if thread.is_alive():
-                self.logger.warning("Content shell IO threads did not shut down gracefully.")
+                self.logger.warning(f"Content shell IO thread {thread.name} did not shut down gracefully.")
                 return False
 
         stopped = not self.is_alive()
@@ -197,7 +204,7 @@
     def check_crash(self, process, test):
         return not self.is_alive()
 
-    def _create_reader_thread(self, stream, queue, prefix=b""):
+    def _create_reader_thread(self, name, stream, queue, prefix=b""):
         """This creates (and starts) a background thread which reads lines from `stream` and
         puts them into `queue` until `stream` reports EOF.
         """
@@ -213,11 +220,14 @@
             queue.close()
             queue.join_thread()
 
-        result = Thread(target=reader_thread, args=(stream, queue, self._io_stopped), daemon=True)
+        result = Thread(name=name,
+                        target=reader_thread,
+                        args=(stream, queue, self._io_stopped),
+                        daemon=True)
         result.start()
         return result
 
-    def _create_writer_thread(self, stream, queue):
+    def _create_writer_thread(self, name, stream, queue):
         """This creates (and starts) a background thread which gets items from `queue` and
         writes them into `stream` until it encounters a None item in the queue.
         """
@@ -230,6 +240,9 @@
                 stream.write(line)
                 stream.flush()
 
-        result = Thread(target=writer_thread, args=(stream, queue), daemon=True)
+        result = Thread(name=name,
+                        target=writer_thread,
+                        args=(stream, queue),
+                        daemon=True)
         result.start()
         return result
diff --git a/tools/wptrunner/wptrunner/browsers/firefox.py b/tools/wptrunner/wptrunner/browsers/firefox.py
index 096f851..24203fc 100644
--- a/tools/wptrunner/wptrunner/browsers/firefox.py
+++ b/tools/wptrunner/wptrunner/browsers/firefox.py
@@ -104,7 +104,7 @@
     require_arg(kwargs, "binary")
 
 
-def browser_kwargs(logger, test_type, run_info_data, config, **kwargs):
+def browser_kwargs(logger, test_type, run_info_data, config, subsuite, **kwargs):
     browser_kwargs = {"binary": kwargs["binary"],
                       "webdriver_binary": kwargs["webdriver_binary"],
                       "webdriver_args": kwargs["webdriver_args"],
@@ -134,6 +134,8 @@
                       "debug_test": kwargs["debug_test"]}
     if test_type == "wdspec" and kwargs["binary"]:
         browser_kwargs["webdriver_args"].extend(["--binary", kwargs["binary"]])
+    browser_kwargs["binary_args"].extend(subsuite.config.get("binary_args", []))
+    browser_kwargs["extra_prefs"].extend(subsuite.config.get("prefs", []))
     return browser_kwargs
 
 
@@ -237,8 +239,18 @@
 
 
 def update_properties():
-    return (["os", "debug", "fission", "processor", "swgl", "asan", "tsan", "editorLegacyDirectionMode"],
-            {"os": ["version"], "processor": ["bits"]})
+    return ([
+        "os",
+        "debug",
+        "fission",
+        "processor",
+        "swgl",
+        "asan",
+        "tsan",
+        "subsuite",
+        "editorLegacyDirectionMode"], {
+        "os": ["version"],
+        "processor": ["bits"]})
 
 
 def log_gecko_crashes(logger, process, test, profile_dir, symbols_path, stackwalk_binary):
diff --git a/tools/wptrunner/wptrunner/executors/base.py b/tools/wptrunner/wptrunner/executors/base.py
index b148365..fa8519b 100644
--- a/tools/wptrunner/wptrunner/executors/base.py
+++ b/tools/wptrunner/wptrunner/executors/base.py
@@ -21,14 +21,15 @@
 here = os.path.dirname(__file__)
 
 
-def executor_kwargs(test_type, test_environment, run_info_data, **kwargs):
+def executor_kwargs(test_type, test_environment, run_info_data, subsuite, **kwargs):
     timeout_multiplier = kwargs["timeout_multiplier"]
     if timeout_multiplier is None:
         timeout_multiplier = 1
 
     executor_kwargs = {"server_config": test_environment.config,
                        "timeout_multiplier": timeout_multiplier,
-                       "debug_info": kwargs["debug_info"]}
+                       "debug_info": kwargs["debug_info"],
+                       "subsuite": subsuite.name}
 
     if test_type in ("reftest", "print-reftest"):
         executor_kwargs["screenshot_cache"] = test_environment.cache_manager.dict()
@@ -262,13 +263,14 @@
 
 
     def __init__(self, logger, browser, server_config, timeout_multiplier=1,
-                 debug_info=None, **kwargs):
+                 debug_info=None, subsuite=None, **kwargs):
         self.logger = logger
         self.runner = None
         self.browser = browser
         self.server_config = server_config
         self.timeout_multiplier = timeout_multiplier
         self.debug_info = debug_info
+        self.subsuite = subsuite
         self.last_environment = {"protocol": "http",
                                  "prefs": {}}
         self.protocol = None  # This must be set in subclasses
@@ -383,6 +385,7 @@
     def __init__(self, executor):
         self.timeout_multiplier = executor.timeout_multiplier
         self.executor = executor
+        self.subsuite = executor.subsuite
         # Cache of url:(screenshot hash, screenshot). Typically the
         # screenshot is None, but we set this value if a test fails
         # and the screenshot was taken from the cache so that we may
@@ -402,7 +405,7 @@
         return self.executor.logger
 
     def get_hash(self, test, viewport_size, dpi, page_ranges):
-        key = (test.url, viewport_size, dpi)
+        key = (self.subsuite, test.url, viewport_size, dpi)
 
         if key not in self.screenshot_cache:
             success, data = self.get_screenshot_list(test, viewport_size, dpi, page_ranges)
diff --git a/tools/wptrunner/wptrunner/formatters/wptreport.py b/tools/wptrunner/wptrunner/formatters/wptreport.py
index 21c1211..5919631 100644
--- a/tools/wptrunner/wptrunner/formatters/wptreport.py
+++ b/tools/wptrunner/wptrunner/formatters/wptreport.py
@@ -26,28 +26,30 @@
         self.results = {}
 
     def suite_start(self, data):
-        if 'run_info' in data:
-            self.results['run_info'] = data['run_info']
+        self.results['run_info'] = data.get('run_info', {})
         self.results['time_start'] = data['time']
         self.results["results"] = []
+        self.results["subsuites"] = {}
+
+    def add_subsuite(self, data):
+        self.results["subsuites"][data["name"]] = data.get("run_info", {})
 
     def suite_end(self, data):
         self.results['time_end'] = data['time']
-        for test_name in self.raw_results:
-            result = {"test": test_name}
-            result.update(self.raw_results[test_name])
-            self.results["results"].append(result)
+        for subsuite, results in self.raw_results.items():
+            for test_name, result in results.items():
+                result["test"] = test_name
+                result["subsuite"] = subsuite
+                self.results["results"].append(result)
         return json.dumps(self.results) + "\n"
 
     def find_or_create_test(self, data):
+        subsuite = data.get("subsuite", "")
         test_name = data["test"]
-        if test_name not in self.raw_results:
-            self.raw_results[test_name] = {
-                "subtests": [],
-                "status": "",
-                "message": None
-            }
-        return self.raw_results[test_name]
+        subsuite_results = self.raw_results.setdefault(subsuite, {})
+        return subsuite_results.setdefault(test_name, {"subtests": [],
+                                                      "status": "",
+                                                      "message": None})
 
     def test_start(self, data):
         test = self.find_or_create_test(data)
@@ -94,10 +96,12 @@
                 if isinstance(item, dict)
             }
         test_name = data["test"]
-        result = {"test": data["test"]}
-        result.update(self.raw_results[test_name])
+        subsuite = data.get("subsuite", "")
+        result = {"test": test_name,
+                  "subsuite": subsuite}
+        result.update(self.raw_results[subsuite][test_name])
         self.results["results"].append(result)
-        self.raw_results.pop(test_name)
+        self.raw_results[subsuite].pop(test_name)
 
     def assertion_count(self, data):
         test = self.find_or_create_test(data)
@@ -113,7 +117,8 @@
         lsan_leaks = self.results["lsan_leaks"]
         lsan_leaks.append({"frames": data["frames"],
                            "scope": data["scope"],
-                           "allowed_match": data.get("allowed_match")})
+                           "allowed_match": data.get("allowed_match"),
+                           "subsuite": data.get("subsuite", "")})
 
     def find_or_create_mozleak(self, data):
         if "mozleak" not in self.results:
@@ -128,10 +133,12 @@
         scope_data["objects"].append({"process": data["process"],
                                       "name": data["name"],
                                       "allowed": data.get("allowed", False),
-                                      "bytes": data["bytes"]})
+                                      "bytes": data["bytes"],
+                                      "subsuite": data.get("subsuite", "")})
 
     def mozleak_total(self, data):
         scope_data = self.find_or_create_mozleak(data)
         scope_data["total"].append({"bytes": data["bytes"],
                                     "threshold": data.get("threshold", 0),
-                                    "process": data["process"]})
+                                    "process": data["process"],
+                                    "subsuite": data.get("subsuite", "")})
diff --git a/tools/wptrunner/wptrunner/manifestupdate.py b/tools/wptrunner/wptrunner/manifestupdate.py
index a919d6d..26d1d68 100644
--- a/tools/wptrunner/wptrunner/manifestupdate.py
+++ b/tools/wptrunner/wptrunner/manifestupdate.py
@@ -944,6 +944,8 @@
         node = ListNode()
         for item in value:
             node.append(make_node(item))
+    else:
+        raise ValueError(f"Unrecognised data type {type(value)}")
     return node
 
 
diff --git a/tools/wptrunner/wptrunner/metadata.py b/tools/wptrunner/wptrunner/metadata.py
index d026fdf..b9cb61e 100644
--- a/tools/wptrunner/wptrunner/metadata.py
+++ b/tools/wptrunner/wptrunner/metadata.py
@@ -244,7 +244,7 @@
 
 
 prop_intern = InternedData(4)
-run_info_intern = InternedData(8)
+run_info_intern = InternedData(16)
 status_intern = InternedData(4)
 
 
@@ -373,8 +373,10 @@
 class ExpectedUpdater:
     def __init__(self, id_test_map):
         self.id_test_map = id_test_map
-        self.run_info = None
+        self.base_run_info = None
+        self.run_info_by_subsuite = {}
         self.action_map = {"suite_start": self.suite_start,
+                           "add_subsuite": self.add_subsuite,
                            "test_start": self.test_start,
                            "test_status": self.test_status,
                            "test_end": self.test_end,
@@ -391,7 +393,8 @@
         # * raw log format
 
         # Try reading a single json object in wptreport format
-        self.run_info = None
+        self.base_run_info = None
+        self.run_info_by_subsuite = {}
         success = self.get_wptreport_data(log_file.read())
 
         if success:
@@ -436,21 +439,27 @@
     def update_from_wptreport_log(self, data):
         action_map = self.action_map
         action_map["suite_start"]({"run_info": data["run_info"]})
+        for subsuite, run_info in data.get("subsuites", {}).items():
+            action_map["add_subsuite"]({"name": subsuite, "run_info": run_info})
         for test in data["results"]:
-            action_map["test_start"]({"test": test["test"]})
+            action_map["test_start"]({"test": test["test"],
+                                      "subsuite": test.get("subsuite", "")})
             for subtest in test["subtests"]:
                 action_map["test_status"]({"test": test["test"],
+                                           "subsuite": test.get("subsuite", ""),
                                            "subtest": subtest["name"],
                                            "status": subtest["status"],
                                            "expected": subtest.get("expected"),
                                            "known_intermittent": subtest.get("known_intermittent", [])})
             action_map["test_end"]({"test": test["test"],
+                                    "subsuite": test.get("subsuite", ""),
                                     "status": test["status"],
                                     "expected": test.get("expected"),
                                     "known_intermittent": test.get("known_intermittent", [])})
             if "asserts" in test:
                 asserts = test["asserts"]
                 action_map["assertion_count"]({"test": test["test"],
+                                               "subsuite": test.get("subsuite", ""),
                                                "count": asserts["count"],
                                                "min_expected": asserts["min"],
                                                "max_expected": asserts["max"]})
@@ -467,7 +476,16 @@
                     action_map[action](item_data)
 
     def suite_start(self, data):
-        self.run_info = run_info_intern.store(RunInfo(data["run_info"]))
+        self.base_run_info = data["run_info"]
+        run_info = RunInfo(data["run_info"])
+        self.run_info_by_subsuite[""] = run_info_intern.store(run_info)
+
+    def add_subsuite(self, data):
+        run_info_data = self.base_run_info.copy()
+        run_info_data.update(data["run_info"])
+        run_info = RunInfo(run_info_data)
+        name = data["name"]
+        self.run_info_by_subsuite[name] = run_info_intern.store(run_info)
 
     def test_start(self, data):
         test_id = intern(ensure_str(data["test"]))
@@ -490,7 +508,7 @@
 
         result = pack_result(data)
 
-        test_data.set(test_id, subtest, "status", self.run_info, result)
+        test_data.set(test_id, subtest, "status", self.run_info_by_subsuite[data.get("subsuite", "")], result)
         status = data["status"]
         expected = data.get("expected")
         if expected and expected != status and status not in data.get("known_intermittent", []):
@@ -507,7 +525,7 @@
 
         result = pack_result(data)
 
-        test_data.set(test_id, None, "status", self.run_info, result)
+        test_data.set(test_id, None, "status", self.run_info_by_subsuite[data.get("subsuite", "")], result)
         status = data["status"]
         expected = data.get("expected")
         if expected and expected != status and status not in data.get("known_intermittent", []):
@@ -520,7 +538,7 @@
         if test_data is None:
             return
 
-        test_data.set(test_id, None, "asserts", self.run_info, data["count"])
+        test_data.set(test_id, None, "asserts", self.run_info_by_subsuite[data.get("subsuite", "")], data["count"])
         if data["count"] < data["min_expected"] or data["count"] > data["max_expected"]:
             test_data.set_requires_update()
 
@@ -537,7 +555,7 @@
             return
         dir_id, test_data = self.test_for_scope(data)
         test_data.set(dir_id, None, "lsan",
-                      self.run_info, (data["frames"], data.get("allowed_match")))
+                      self.run_info_by_subsuite[data.get("subsuite", "")], (data["frames"], data.get("allowed_match")))
         if not data.get("allowed_match"):
             test_data.set_requires_update()
 
@@ -547,7 +565,7 @@
             return
         dir_id, test_data = self.test_for_scope(data)
         test_data.set(dir_id, None, "leak-object",
-                      self.run_info, ("%s:%s", (data["process"], data["name"]),
+                      self.run_info_by_subsuite[data.get("subsuite", "")], ("%s:%s", (data["process"], data["name"]),
                                       data.get("allowed")))
         if not data.get("allowed"):
             test_data.set_requires_update()
@@ -559,7 +577,7 @@
         if data["bytes"]:
             dir_id, test_data = self.test_for_scope(data)
             test_data.set(dir_id, None, "leak-threshold",
-                          self.run_info, (data["process"], data["bytes"], data["threshold"]))
+                          self.run_info_by_subsuite[data.get("subsuite", "")], (data["process"], data["bytes"], data["threshold"]))
             if data["bytes"] > data["threshold"] or data["bytes"] < 0:
                 test_data.set_requires_update()
 
@@ -605,10 +623,11 @@
 class PackedResultList:
     """Class for storing test results.
 
-    Results are stored as an array of 2-byte integers for compactness.
-    The first 4 bits represent the property name, the second 4 bits
+    Results are stored as an array of 4-byte integers for compactness
+    with the first 8 bits reserved. In the remaining 24 bits,
+    the first 4 bits represent the property name, the second 4 bits
     represent the test status (if it's a result with a status code), and
-    the final 8 bits represent the run_info. If the result doesn't have a
+    the final 16 bits represent the run_info. If the result doesn't have a
     simple status code but instead a richer type, we place that richer type
     in a dictionary and set the status part of the result type to 0.
 
@@ -617,14 +636,14 @@
     and corresponding Python objects."""
 
     def __init__(self):
-        self.data = array.array("H")
+        self.data = array.array("L")
 
     __slots__ = ("data", "raw_data")
 
     def append(self, prop, run_info, value):
-        out_val = (prop << 12) + run_info
+        out_val = (prop << 20) + run_info
         if prop == prop_intern.store("status") and isinstance(value, int):
-            out_val += value << 8
+            out_val += value << 16
         else:
             if not hasattr(self, "raw_data"):
                 self.raw_data = {}
@@ -632,15 +651,15 @@
         self.data.append(out_val)
 
     def unpack(self, idx, packed):
-        prop = prop_intern.get((packed & 0xF000) >> 12)
+        prop = prop_intern.get((packed & 0xF00000) >> 20)
 
-        value_idx = (packed & 0x0F00) >> 8
+        value_idx = (packed & 0x0F0000) >> 16
         if value_idx == 0:
             value = self.raw_data[idx]
         else:
             value = status_intern.get(value_idx)
 
-        run_info = run_info_intern.get(packed & 0x00FF)
+        run_info = run_info_intern.get(packed & 0x00FFFF)
 
         return prop, run_info, value
 
diff --git a/tools/wptrunner/wptrunner/testloader.py b/tools/wptrunner/wptrunner/testloader.py
index 86004b8..cc41dfa 100644
--- a/tools/wptrunner/wptrunner/testloader.py
+++ b/tools/wptrunner/wptrunner/testloader.py
@@ -1,4 +1,5 @@
 # mypy: allow-untyped-defs
+from __future__ import annotations
 
 import abc
 import hashlib
@@ -9,7 +10,7 @@
 from abc import ABCMeta, abstractmethod
 from queue import Empty
 from collections import defaultdict, deque, namedtuple
-from typing import Any, cast
+from typing import cast, Any, Dict, List, Optional, Set
 
 from . import manifestinclude
 from . import manifestexpected
@@ -31,29 +32,108 @@
     from manifest.download import download_from_github  # type: ignore
 
 
-class TestGroupsFile:
-    """
-    Mapping object representing {group name: [test ids]}
-    """
-
-    def __init__(self, logger, path):
+class TestGroups:
+    def __init__(self, logger, path, subsuites):
         try:
             with open(path) as f:
-                self._data = json.load(f)
+                data = json.load(f)
         except ValueError:
             logger.critical("test groups file %s not valid json" % path)
             raise
 
+        self.tests_by_group = defaultdict(set)
         self.group_by_test = {}
-        for group, test_ids in self._data.items():
+        for group, test_ids in data.items():
+            id_parts = group.split(":", 1)
+            if len(id_parts) == 1:
+                group_name = id_parts[0]
+                subsuite = ""
+            else:
+                subsuite, group_name = id_parts
+                if subsuite not in subsuites:
+                    raise ValueError(f"Unknown subsuite {subsuite} in group data {group}")
             for test_id in test_ids:
-                self.group_by_test[test_id] = group
+                self.group_by_test[(subsuite, test_id)] = group_name
+                self.tests_by_group[group_name].add(test_id)
 
-    def __contains__(self, key):
-        return key in self._data
 
-    def __getitem__(self, key):
-        return self._data[key]
+def load_subsuites(logger: Any,
+                   base_run_info: wpttest.RunInfo,
+                   path: Optional[str],
+                   include_subsuites: Set[str]) -> Dict[str, Subsuite]:
+    subsuites: Dict[str, Subsuite] = {}
+    run_all_subsuites = not include_subsuites
+    include_subsuites.add("")
+
+    def maybe_add_subsuite(name: str, data: Dict[str, Any]) -> None:
+        if run_all_subsuites or name in include_subsuites:
+            subsuites[name] = Subsuite(name,
+                                       data.get("config", {}),
+                                       base_run_info,
+                                       run_info_extras=data.get("run_info", {}),
+                                       include=data.get("include"),
+                                       tags=set(data["tags"]) if "tags" in data else None)
+            if name in include_subsuites:
+                include_subsuites.remove(name)
+
+    maybe_add_subsuite("", {})
+
+    if path is None:
+        if include_subsuites:
+            raise ValueError(f"Unrecognised subsuites {','.join(include_subsuites)}, missing --subsuite-file?")
+        return subsuites
+
+    try:
+        with open(path) as f:
+            data = json.load(f)
+    except ValueError:
+        logger.critical("subsuites file %s not valid json" % path)
+        raise
+
+    for key, subsuite in data.items():
+        if key == "":
+            raise ValueError("Subsuites must have a non-empty name")
+        maybe_add_subsuite(key, subsuite)
+
+    if include_subsuites:
+        raise ValueError(f"Unrecognised subsuites {','.join(include_subsuites)}")
+
+    return subsuites
+
+
+class Subsuite:
+    def __init__(self,
+                 name: str,
+                 config: Dict[str, Any],
+                 base_run_info: Optional[wpttest.RunInfo] = None,
+                 run_info_extras: Optional[Dict[str, Any]] = None,
+                 include: Optional[List[str]] = None,
+                 tags: Optional[Set[str]] = None):
+        self.name = name
+        self.config = config
+        self.run_info_extras = run_info_extras or {}
+        self.run_info_extras["subsuite"] = name
+        self.include = include
+        self.tags = tags
+
+        run_info = base_run_info.copy() if base_run_info is not None else {}
+        run_info.update(self.run_info_extras)
+        self.run_info = run_info
+
+    def manifest_filters(self, manifests):
+        if self.name:
+            manifest_filters = [TestFilter(manifests,
+                                           include=self.include,
+                                           explicit=True)]
+            return manifest_filters
+
+        # use base manifest_filters for default subsuite
+        return []
+
+    def __repr__(self):
+        return "Subsuite('%s', config:%s, run_info:%s)" % (self.name or 'default',
+                                                           str(self.config),
+                                                           str(self.run_info))
 
 
 def read_include_from_file(file):
@@ -74,8 +154,8 @@
         return
     new_include = []
     for item in include:
-        if item in test_groups:
-            new_include.extend(test_groups[item])
+        if item in test_groups.tests_by_group:
+            new_include.extend(test_groups.tests_by_group[item])
         else:
             new_include.append(item)
     return new_include
@@ -240,7 +320,8 @@
     def __init__(self,
                  test_manifests,
                  test_types,
-                 run_info,
+                 base_run_info,
+                 subsuites=None,
                  manifest_filters=None,
                  test_filters=None,
                  chunk_type="none",
@@ -254,7 +335,8 @@
                  chunker_kwargs=None):
 
         self.test_types = test_types
-        self.run_info = run_info
+        self.base_run_info = base_run_info
+        self.subsuites = subsuites or {}
 
         self.manifest_filters = manifest_filters if manifest_filters is not None else []
         self.test_filters = test_filters if test_filters is not None else []
@@ -284,7 +366,6 @@
         self._test_ids = None
 
         self.directory_manifests = {}
-
         self._load_tests()
 
     @property
@@ -292,8 +373,9 @@
         if self._test_ids is None:
             self._test_ids = []
             for test_dict in [self.disabled_tests, self.tests]:
-                for test_type in self.test_types:
-                    self._test_ids += [item.id for item in test_dict[test_type]]
+                for subsuite in self.subsuites:
+                    for test_type in self.test_types:
+                        self._test_ids += [item.id for item in test_dict[subsuite][test_type]]
         return self._test_ids
 
     def get_test(self, manifest_file, manifest_test, inherit_metadata, test_metadata):
@@ -303,31 +385,31 @@
 
         return wpttest.from_manifest(manifest_file, manifest_test, inherit_metadata, test_metadata)
 
-    def load_dir_metadata(self, test_manifest, metadata_path, test_path):
+    def load_dir_metadata(self, run_info, test_manifest, metadata_path, test_path):
         rv = []
         path_parts = os.path.dirname(test_path).split(os.path.sep)
         for i in range(len(path_parts) + 1):
             path = os.path.join(metadata_path, os.path.sep.join(path_parts[:i]), "__dir__.ini")
             if path not in self.directory_manifests:
                 self.directory_manifests[path] = manifestexpected.get_dir_manifest(path,
-                                                                                   self.run_info)
+                                                                                   run_info)
             manifest = self.directory_manifests[path]
             if manifest is not None:
                 rv.append(manifest)
         return rv
 
-    def load_metadata(self, test_manifest, metadata_path, test_path):
-        inherit_metadata = self.load_dir_metadata(test_manifest, metadata_path, test_path)
+    def load_metadata(self, run_info, test_manifest, metadata_path, test_path):
+        inherit_metadata = self.load_dir_metadata(run_info, test_manifest, metadata_path, test_path)
         test_metadata = manifestexpected.get_manifest(
-            metadata_path, test_path, self.run_info)
+            metadata_path, test_path, run_info)
         return inherit_metadata, test_metadata
 
-    def iter_tests(self):
+    def iter_tests(self, run_info, manifest_filters):
         manifest_items = []
         manifests_by_url_base = {}
 
         for manifest in sorted(self.manifests.keys(), key=lambda x:x.url_base):
-            manifest_iter = iterfilter(self.manifest_filters,
+            manifest_iter = iterfilter(manifest_filters,
                                        manifest.itertypes(*self.test_types))
             manifest_items.extend(manifest_iter)
             manifests_by_url_base[manifest.url_base] = manifest
@@ -339,32 +421,40 @@
             manifest_file = manifests_by_url_base[next(iter(tests)).url_base]
             metadata_path = self.manifests[manifest_file]["metadata_path"]
 
-            inherit_metadata, test_metadata = self.load_metadata(manifest_file, metadata_path, test_path)
+            inherit_metadata, test_metadata = self.load_metadata(run_info, manifest_file, metadata_path, test_path)
             for test in tests:
                 wpt_test = self.get_test(manifest_file, test, inherit_metadata, test_metadata)
                 if all(f(wpt_test) for f in self.test_filters):
                     yield test_path, test_type, wpt_test
 
     def _load_tests(self):
-        """Read in the tests from the manifest file and add them to a queue"""
-        tests = {"enabled":defaultdict(list),
-                 "disabled":defaultdict(list)}
+        """Read in the tests from the manifest file"""
+        tests_enabled = {}
+        tests_disabled = {}
 
-        for test_path, test_type, test in self.iter_tests():
-            enabled = not test.disabled()
-            if not self.include_https and test.environment["protocol"] == "https":
-                enabled = False
-            if not self.include_h2 and test.environment["protocol"] == "h2":
-                enabled = False
-            if self.skip_timeout and test.expected() == "TIMEOUT":
-                enabled = False
-            if self.skip_implementation_status and test.implementation_status() in self.skip_implementation_status:
-                enabled = False
-            key = "enabled" if enabled else "disabled"
-            tests[key][test_type].append(test)
+        for subsuite_name, subsuite in self.subsuites.items():
+            tests_enabled[subsuite_name] = defaultdict(list)
+            tests_disabled[subsuite_name] = defaultdict(list)
+            run_info = subsuite.run_info
+            if not subsuite_name:
+                manifest_filters = self.manifest_filters
+            else:
+                manifest_filters = subsuite.manifest_filters(self.manifests)
+            for test_path, test_type, test in self.iter_tests(run_info, manifest_filters):
+                enabled = not test.disabled()
+                if not self.include_https and test.environment["protocol"] == "https":
+                    enabled = False
+                if not self.include_h2 and test.environment["protocol"] == "h2":
+                    enabled = False
+                if self.skip_timeout and test.expected() == "TIMEOUT":
+                    enabled = False
+                if self.skip_implementation_status and test.implementation_status() in self.skip_implementation_status:
+                    enabled = False
+                target = tests_enabled if enabled else tests_disabled
+                target[subsuite_name][test_type].append(test)
 
-        self.tests = tests["enabled"]
-        self.disabled_tests = tests["disabled"]
+        self.tests = tests_enabled
+        self.disabled_tests = tests_disabled
 
     def groups(self, test_types, chunk_type="none", total_chunks=1, chunk_number=1):
         groups = set()
@@ -377,6 +467,9 @@
         return groups
 
 
+TestSourceData = namedtuple("TestSourceData", ["cls", "kwargs"])
+
+
 def get_test_src(**kwargs):
     test_source_kwargs = {"processes": kwargs["processes"],
                           "logger": kwargs["logger"]}
@@ -391,10 +484,10 @@
         test_source_kwargs["test_groups"] = kwargs["test_groups"]
     else:
         test_source_cls = SingleTestSource
-    return test_source_cls, test_source_kwargs, chunker_kwargs
+    return TestSourceData(test_source_cls, test_source_kwargs), chunker_kwargs
 
 
-TestGroup = namedtuple("TestGroup", ["group", "test_type", "metadata"])
+TestGroup = namedtuple("TestGroup", ["group", "subsuite", "test_type", "metadata"])
 
 
 class TestSource:
@@ -402,7 +495,7 @@
 
     def __init__(self, test_queue):
         self.test_queue = test_queue
-        self.current_group = TestGroup(None, None, None)
+        self.current_group = TestGroup(None, None, None, None)
         self.logger = structured.get_default_logger()
         if self.logger is None:
             self.logger = structured.structuredlog.StructuredLogger("TestSource")
@@ -427,11 +520,12 @@
         cls.add_sentinal(test_queue, processes)
         return test_queue, processes
 
+    @classmethod
     @abstractmethod
-    #@classmethod (doesn't compose with @abstractmethod in < 3.3)
     def make_groups(cls, tests_by_type, **kwargs):  # noqa: N805
         pass
 
+    @classmethod
     @abstractmethod
     def tests_by_group(cls, tests_by_type, **kwargs):  # noqa: N805
         pass
@@ -446,14 +540,14 @@
                 self.current_group = self.test_queue.get(block=True, timeout=5)
             except Empty:
                 self.logger.warning("Timed out getting test group from queue")
-                return TestGroup(None, None, None)
+                return TestGroup(None, None, None, None)
         return self.current_group
 
     @classmethod
     def add_sentinal(cls, test_queue, num_of_workers):
         # add one sentinal for each worker
         for _ in range(num_of_workers):
-            test_queue.put(TestGroup(None, None, None))
+            test_queue.put(TestGroup(None, None, None, None))
 
     @classmethod
     def process_count(cls, requested_processes, num_test_groups):
@@ -463,42 +557,11 @@
         return max(1, min(requested_processes, num_test_groups))
 
 
-class GroupedSource(TestSource):
-    @classmethod
-    def new_group(cls, state, test_type, test, **kwargs):
-        raise NotImplementedError
-
-    @classmethod
-    def make_groups(cls, tests_by_type, **kwargs):
-        groups, state = [], {}
-        for test_type, tests in tests_by_type.items():
-            for test in tests:
-                if cls.new_group(state, test_type, test, **kwargs):
-                    group_metadata = cls.group_metadata(state)
-                    groups.append(TestGroup(deque(), test_type, group_metadata))
-                group, _, metadata = groups[-1]
-                group.append(test)
-                test.update_metadata(metadata)
-        return groups
-
-    @classmethod
-    def tests_by_group(cls, tests_by_type, **kwargs):
-        groups = defaultdict(list)
-        state = {}
-        current = None
-        for test_type, tests in tests_by_type.items():
-            for test in tests:
-                if cls.new_group(state, test_type, test, **kwargs):
-                    current = cls.group_metadata(state)['scope']
-                groups[current].append(test.id)
-        return groups
-
-
 class SingleTestSource(TestSource):
     @classmethod
     def make_groups(cls, tests_by_type, **kwargs):
         groups = []
-        for test_type, tests in tests_by_type.items():
+        for (subsuite, test_type), tests in tests_by_type.items():
             processes = kwargs["processes"]
             queues = [deque([]) for _ in range(processes)]
             metadatas = [cls.group_metadata(None) for _ in range(processes)]
@@ -509,41 +572,73 @@
                 group.append(test)
                 test.update_metadata(metadata)
 
-            for item in zip(queues, itertools.repeat(test_type), metadatas):
+            for item in zip(queues,
+                            itertools.repeat(subsuite),
+                            itertools.repeat(test_type),
+                            metadatas):
                 if len(item[0]) > 0:
                     groups.append(TestGroup(*item))
         return groups
 
     @classmethod
     def tests_by_group(cls, tests_by_type, **kwargs):
-        return {cls.group_metadata(None)['scope']:
-                [t.id for t in itertools.chain.from_iterable(tests_by_type.values())]}
+        rv = {}
+        for (subsuite, test_type), tests in tests_by_type.items():
+            group_name = f"{subsuite}:{cls.group_metadata(None)['scope']}"
+            rv[group_name] = [t.id for t in tests]
+        return rv
 
 
-class PathGroupedSource(GroupedSource):
+class PathGroupedSource(TestSource):
     @classmethod
-    def new_group(cls, state, test_type, test, **kwargs):
+    def new_group(cls, state, subsuite, test_type, test, **kwargs):
         depth = kwargs.get("depth")
         if depth is True or depth == 0:
             depth = None
         path = urlsplit(test.url).path.split("/")[1:-1][:depth]
-        rv = (test_type != state.get("prev_test_type") or
-              path != state.get("prev_path"))
-        state["prev_test_type"] = test_type
-        state["prev_path"] = path
+        rv = (subsuite, test_type, path) != state.get("prev_group_key")
+        state["prev_group_key"] = (subsuite, test_type, path)
         return rv
 
     @classmethod
+    def make_groups(cls, tests_by_type, **kwargs):
+        groups, state = [], {}
+        for (subsuite, test_type), tests in tests_by_type.items():
+            for test in tests:
+                if cls.new_group(state, subsuite, test_type, test, **kwargs):
+                    group_metadata = cls.group_metadata(state)
+                    groups.append(TestGroup(deque(), subsuite, test_type, group_metadata))
+                group, _, _, metadata = groups[-1]
+                group.append(test)
+                test.update_metadata(metadata)
+        return groups
+
+    @classmethod
+    def tests_by_group(cls, tests_by_type, **kwargs):
+        groups = defaultdict(list)
+        state = {}
+        for (subsuite, test_type), tests in tests_by_type.items():
+            for test in tests:
+                if cls.new_group(state, subsuite, test_type, test, **kwargs):
+                    group = cls.group_metadata(state)['scope']
+                if subsuite is not None:
+                    group_name = f"{subsuite}:{group}"
+                else:
+                    group_name = group
+                groups[group_name].append(test.id)
+        return groups
+
+    @classmethod
     def group_metadata(cls, state):
-        return {"scope": "/%s" % "/".join(state["prev_path"])}
+        return {"scope": "/%s" % "/".join(state["prev_group_key"][2])}
 
 
 class GroupFileTestSource(TestSource):
     @classmethod
     def make_groups(cls, tests_by_type, **kwargs):
         groups = []
-        for test_type, tests in tests_by_type.items():
-            tests_by_group = cls.tests_by_group({test_type: tests},
+        for (subsuite, test_type), tests in tests_by_type.items():
+            tests_by_group = cls.tests_by_group({(subsuite, test_type): tests},
                                                 **kwargs)
             ids_to_tests = {test.id: test for test in tests}
             for group_name, test_ids in tests_by_group.items():
@@ -553,7 +648,7 @@
                     test = ids_to_tests[test_id]
                     group.append(test)
                     test.update_metadata(group_metadata)
-                groups.append(TestGroup(group, test_type, group_metadata))
+                groups.append(TestGroup(group, subsuite, test_type, group_metadata))
         return groups
 
     @classmethod
@@ -562,12 +657,17 @@
         test_groups = kwargs["test_groups"]
 
         tests_by_group = defaultdict(list)
-        for test in itertools.chain.from_iterable(tests_by_type.values()):
-            try:
-                group = test_groups.group_by_test[test.id]
-            except KeyError:
-                logger.error("%s is missing from test groups file" % test.id)
-                raise
-            tests_by_group[group].append(test.id)
+        for (subsuite, test_type), tests in tests_by_type.items():
+            for test in tests:
+                try:
+                    group = test_groups.group_by_test[(subsuite, test.id)]
+                except KeyError:
+                    logger.error("%s is missing from test groups file" % test.id)
+                    raise
+                if subsuite:
+                    group_name = f"{subsuite}:{group}"
+                else:
+                    group_name = group
+                tests_by_group[group_name].append(test.id)
 
         return tests_by_group
diff --git a/tools/wptrunner/wptrunner/testrunner.py b/tools/wptrunner/wptrunner/testrunner.py
index 9dae8b0..af91167 100644
--- a/tools/wptrunner/wptrunner/testrunner.py
+++ b/tools/wptrunner/wptrunner/testrunner.py
@@ -4,8 +4,8 @@
 import time
 import traceback
 from queue import Empty
-from collections import namedtuple
-from typing import Optional
+from collections import namedtuple, defaultdict
+from typing import Any, Mapping, Optional
 
 from mozlog import structuredlog, capture
 
@@ -255,10 +255,10 @@
 class _RunnerManagerState:
     before_init = namedtuple("before_init", [])
     initializing = namedtuple("initializing",
-                              ["test_type", "test", "test_group",
+                              ["subsuite", "test_type", "test", "test_group",
                                "group_metadata", "failure_count"])
-    running = namedtuple("running", ["test_type", "test", "test_group", "group_metadata"])
-    restarting = namedtuple("restarting", ["test_type", "test", "test_group",
+    running = namedtuple("running", ["subsuite", "test_type", "test", "test_group", "group_metadata"])
+    restarting = namedtuple("restarting", ["subsuite", "test_type", "test", "test_group",
                                            "group_metadata", "force_stop"])
     error = namedtuple("error", [])
     stop = namedtuple("stop", ["force_stop"])
@@ -269,7 +269,7 @@
 
 class TestRunnerManager(threading.Thread):
     def __init__(self, suite_name, index, test_queue, test_source_cls,
-                 test_implementation_by_type, stop_flag, rerun=1,
+                 test_implementations, stop_flag, rerun=1,
                  pause_after_test=False, pause_on_unexpected=False,
                  restart_on_unexpected=True, debug_info=None,
                  capture_stdio=True, restart_on_new_group=True, recording=None, max_restarts=5):
@@ -293,22 +293,22 @@
         self.test_source = test_source_cls(test_queue)
 
         self.manager_number = index
-        self.test_type = None
+        self.test_implementation_key = None
 
-        self.test_implementation_by_type = {}
-        for test_type, test_implementation in test_implementation_by_type.items():
+        self.test_implementations = {}
+        for key, test_implementation in test_implementations.items():
             browser_kwargs = test_implementation.browser_kwargs
             if browser_kwargs.get("device_serial"):
                 browser_kwargs = browser_kwargs.copy()
                 # Assign Android device to runner according to current manager index
                 browser_kwargs["device_serial"] = browser_kwargs["device_serial"][index]
-                self.test_implementation_by_type[test_type] = TestImplementation(
+                self.test_implementations[key] = TestImplementation(
                     test_implementation.executor_cls,
                     test_implementation.executor_kwargs,
                     test_implementation.browser_cls,
                     browser_kwargs)
             else:
-                self.test_implementation_by_type[test_type] = test_implementation
+                self.test_implementations[key] = test_implementation
 
         mp = mpcontext.get_context()
 
@@ -336,8 +336,8 @@
         self.logger = None
 
         self.test_count = 0
-        self.unexpected_tests = set()
-        self.unexpected_pass_tests = set()
+        self.unexpected_fail_tests = defaultdict(list)
+        self.unexpected_pass_tests = defaultdict(list)
 
         # This may not really be what we want
         self.daemon = True
@@ -436,7 +436,8 @@
             self.logger.debug("Got command: %r" % command)
         except OSError:
             self.logger.error("Got IOError from poll")
-            return RunnerManagerState.restarting(self.state.test_type,
+            return RunnerManagerState.restarting(self.state.subsuite,
+                                                 self.state.test_type,
                                                  self.state.test,
                                                  self.state.test_group,
                                                  self.state.group_metadata,
@@ -483,12 +484,12 @@
         return self.child_stop_flag.is_set() or self.parent_stop_flag.is_set()
 
     def start_init(self):
-        test_type, test, test_group, group_metadata = self.get_next_test()
+        subsuite, test_type, test, test_group, group_metadata = self.get_next_test()
         self.recording.set(["testrunner", "init"])
         if test is None:
             return RunnerManagerState.stop(True)
         else:
-            return RunnerManagerState.initializing(test_type, test, test_group, group_metadata, 0)
+            return RunnerManagerState.initializing(subsuite, test_type, test, test_group, group_metadata, 0)
 
     def init(self):
         assert isinstance(self.state, RunnerManagerState.initializing)
@@ -496,11 +497,11 @@
             self.logger.critical("Max restarts exceeded")
             return RunnerManagerState.error()
 
-        if self.state.test_type != self.test_type:
+        if (self.state.subsuite, self.state.test_type) != self.test_implementation_key:
             if self.browser is not None:
                 assert self.browser.browser is not None
                 self.browser.browser.cleanup()
-            impl = self.test_implementation_by_type[self.state.test_type]
+            impl = self.test_implementations[(self.state.subsuite, self.state.test_type)]
             browser = impl.browser_cls(self.logger, remote_queue=self.command_queue,
                                        **impl.browser_kwargs)
             browser.setup()
@@ -508,7 +509,7 @@
                                           browser,
                                           self.command_queue,
                                           no_timeout=self.debug_info is not None)
-            self.test_type = self.state.test_type
+            self.test_implementation_key = (self.state.subsuite, self.state.test_type)
 
         assert self.browser is not None
         self.browser.update_settings(self.state.test)
@@ -527,7 +528,7 @@
         assert self.command_queue is not None
         assert self.remote_queue is not None
         self.logger.info("Starting runner")
-        impl = self.test_implementation_by_type[self.state.test_type]
+        impl = self.test_implementations[(self.state.subsuite, self.state.test_type)]
         self.executor_cls = impl.executor_cls
         self.executor_kwargs = impl.executor_kwargs
         self.executor_kwargs["group_metadata"] = self.state.group_metadata
@@ -547,8 +548,7 @@
         mp = mpcontext.get_context()
         self.test_runner_proc = mp.Process(target=start_runner,
                                            args=args,
-                                           name="TestRunner-%s-%i" % (
-                                               self.test_type, self.manager_number))
+                                           name="TestRunner-%i" % self.manager_number)
         self.test_runner_proc.start()
         self.logger.debug("Test runner started")
         # Now we wait for either an init_succeeded event or an init_failed event
@@ -556,7 +556,8 @@
     def init_succeeded(self):
         assert isinstance(self.state, RunnerManagerState.initializing)
         self.browser.after_init()
-        return RunnerManagerState.running(self.state.test_type,
+        return RunnerManagerState.running(self.state.subsuite,
+                                          self.state.test_type,
                                           self.state.test,
                                           self.state.test_group,
                                           self.state.group_metadata)
@@ -566,7 +567,8 @@
         self.browser.check_crash(None)
         self.browser.after_init()
         self.stop_runner(force=True)
-        return RunnerManagerState.initializing(self.state.test_type,
+        return RunnerManagerState.initializing(self.state.subsuite,
+                                               self.state.test_type,
                                                self.state.test,
                                                self.state.test_group,
                                                self.state.group_metadata,
@@ -578,13 +580,13 @@
         test_group = None
         while test is None:
             while test_group is None or len(test_group) == 0:
-                test_group, test_type, group_metadata = self.test_source.group()
+                test_group, subsuite, test_type, group_metadata = self.test_source.group()
                 if test_group is None:
                     self.logger.info("No more tests")
-                    return None, None, None, None
+                    return None, None, None, None, None
             test = test_group.popleft()
         self.run_count = 0
-        return test_type, test, test_group, group_metadata
+        return subsuite, test_type, test, test_group, group_metadata
 
     def run_test(self):
         assert isinstance(self.state, RunnerManagerState.running)
@@ -592,14 +594,15 @@
 
         if self.browser.update_settings(self.state.test):
             self.logger.info("Restarting browser for new test environment")
-            return RunnerManagerState.restarting(self.state.test_type,
+            return RunnerManagerState.restarting(self.state.subsuite,
+                                                 self.state.test_type,
                                                  self.state.test,
                                                  self.state.test_group,
                                                  self.state.group_metadata,
                                                  False)
 
         self.recording.set(["testrunner", "test"] + self.state.test.id.split("/")[1:])
-        self.logger.test_start(self.state.test.id)
+        self.logger.test_start(self.state.test.id, subsuite=self.state.subsuite)
         if self.rerun > 1:
             self.logger.info("Run %d/%d" % (self.run_count, self.rerun))
             self.send_message("reset")
@@ -670,7 +673,8 @@
                                     message=result.message,
                                     expected=expected,
                                     known_intermittent=known_intermittent,
-                                    stack=result.stack)
+                                    stack=result.stack,
+                                    subsuite=self.state.subsuite)
 
         expected = test.expected()
         known_intermittent = test.known_intermittent()
@@ -697,9 +701,6 @@
         is_unexpected = expected != status and status not in known_intermittent
         is_pass_or_expected = status in ["OK", "PASS"] or (not is_unexpected)
 
-        if is_unexpected or subtest_unexpected:
-            self.unexpected_tests.add(test.id)
-
         # A result is unexpected pass if the test or any subtest run
         # unexpectedly, and the overall status is expected or passing (OK for test
         # harness test, or PASS for reftest), and all unexpected results for
@@ -707,7 +708,9 @@
         is_unexpected_pass = ((is_unexpected or subtest_unexpected) and
                               is_pass_or_expected and subtest_all_pass_or_expected)
         if is_unexpected_pass:
-            self.unexpected_pass_tests.add(test.id)
+            self.unexpected_pass_tests[self.state.subsuite, test.test_type].append(test)
+        elif is_unexpected or subtest_unexpected:
+            self.unexpected_fail_tests[self.state.subsuite, test.test_type].append(test)
 
         if "assertion_count" in file_result.extra:
             assertion_count = file_result.extra["assertion_count"]
@@ -725,7 +728,8 @@
                              expected=expected,
                              known_intermittent=known_intermittent,
                              extra=file_result.extra,
-                             stack=file_result.stack)
+                             stack=file_result.stack,
+                             subsuite=self.state.subsuite)
 
         restart_before_next = (test.restart_after or
                                file_result.status in ("CRASH", "EXTERNAL-TIMEOUT", "INTERNAL-ERROR") or
@@ -756,7 +760,7 @@
         # that as long as we've done at least the automatic run count in total we can
         # continue with the next test.
         if not force_rerun and self.run_count >= self.rerun:
-            test_type, test, test_group, group_metadata = self.get_next_test()
+            subsuite, test_type, test, test_group, group_metadata = self.get_next_test()
             if test is None:
                 return RunnerManagerState.stop(force_stop)
             if test_type != self.state.test_type:
@@ -772,20 +776,20 @@
 
         if restart:
             return RunnerManagerState.restarting(
-                test_type, test, test_group, group_metadata, force_stop)
+                subsuite, test_type, test, test_group, group_metadata, force_stop)
         else:
             return RunnerManagerState.running(
-                test_type, test, test_group, group_metadata)
+                subsuite, test_type, test, test_group, group_metadata)
 
     def restart_runner(self):
         """Stop and restart the TestRunner"""
         assert isinstance(self.state, RunnerManagerState.restarting)
         self.stop_runner(force=self.state.force_stop)
         return RunnerManagerState.initializing(
-            self.state.test_type, self.state.test,
+            self.state.subsuite, self.state.test_type, self.state.test,
             self.state.test_group, self.state.group_metadata, 0)
 
-    def log(self, data):
+    def log(self, data: Mapping[str, Any]) -> None:
         self.logger.log_raw(data)
 
     def error(self, message):
@@ -821,9 +825,7 @@
             return
 
         self.browser.stop(force=True)
-        self.logger.debug("waiting for runner process to end")
         self.test_runner_proc.join(10)
-        self.logger.debug("After join")
         mp = mpcontext.get_context()
         if self.test_runner_proc.is_alive():
             # This might leak a file handle from the queue
@@ -886,8 +888,8 @@
                 break
 
 
-def make_test_queue(tests, test_source_cls, **test_source_kwargs):
-    queue, num_of_workers = test_source_cls.make_queue(tests, **test_source_kwargs)
+def make_test_queue(tests, test_source):
+    queue, num_of_workers = test_source.cls.make_queue(tests, **test_source.kwargs)
 
     # There is a race condition that means sometimes we continue
     # before the tests have been written to the underlying pipe.
@@ -899,8 +901,7 @@
 
 class ManagerGroup:
     """Main thread object that owns all the TestRunnerManager threads."""
-    def __init__(self, suite_name, test_source_cls, test_source_kwargs,
-                 test_implementation_by_type,
+    def __init__(self, suite_name, test_source, test_implementations,
                  rerun=1,
                  pause_after_test=False,
                  pause_on_unexpected=False,
@@ -911,9 +912,8 @@
                  recording=None,
                  max_restarts=5):
         self.suite_name = suite_name
-        self.test_source_cls = test_source_cls
-        self.test_source_kwargs = test_source_kwargs
-        self.test_implementation_by_type = test_implementation_by_type
+        self.test_source = test_source
+        self.test_implementations = test_implementations
         self.pause_after_test = pause_after_test
         self.pause_on_unexpected = pause_on_unexpected
         self.restart_on_unexpected = restart_on_unexpected
@@ -940,15 +940,15 @@
     def run(self, tests):
         """Start all managers in the group"""
 
-        test_queue, size = make_test_queue(tests, self.test_source_cls, **self.test_source_kwargs)
+        test_queue, size = make_test_queue(tests, self.test_source)
         self.logger.info("Using %i child processes" % size)
 
         for idx in range(size):
             manager = TestRunnerManager(self.suite_name,
                                         idx,
                                         test_queue,
-                                        self.test_source_cls,
-                                        self.test_implementation_by_type,
+                                        self.test_source.cls,
+                                        self.test_implementations,
                                         self.stop_flag,
                                         self.rerun,
                                         self.pause_after_test,
@@ -986,8 +986,16 @@
     def test_count(self):
         return sum(manager.test_count for manager in self.pool)
 
-    def unexpected_tests(self):
-        return set().union(*(manager.unexpected_tests for manager in self.pool))
+    def unexpected_fail_tests(self):
+        rv = defaultdict(list)
+        for manager in self.pool:
+            for (subsuite, test_type), tests in manager.unexpected_fail_tests.items():
+                rv[(subsuite, test_type)].extend(tests)
+        return rv
 
     def unexpected_pass_tests(self):
-        return set().union(*(manager.unexpected_pass_tests for manager in self.pool))
+        rv = defaultdict(list)
+        for manager in self.pool:
+            for (subsuite, test_type), tests in manager.unexpected_pass_tests.items():
+                rv[(subsuite, test_type)].extend(tests)
+        return rv
diff --git a/tools/wptrunner/wptrunner/tests/browsers/test_webkitgtk.py b/tools/wptrunner/wptrunner/tests/browsers/test_webkitgtk.py
index 5b751e9..324142c 100644
--- a/tools/wptrunner/wptrunner/tests/browsers/test_webkitgtk.py
+++ b/tools/wptrunner/wptrunner/tests/browsers/test_webkitgtk.py
@@ -7,7 +7,7 @@
 
 from wptserve.config import ConfigBuilder
 from ..base import active_products
-from wptrunner import environment, products
+from wptrunner import environment, products, testloader
 
 test_paths = {"/": {"tests_path": join(dirname(__file__), "..", "..", "..", "..", "..")}}  # repo root
 environment.do_delayed_imports(None, test_paths)
@@ -44,6 +44,7 @@
     kwargs["pause_after_test"] = False
     kwargs["pause_on_unexpected"] = False
     kwargs["debug_test"] = False
+    kwargs["subsuite"] = testloader.Subsuite("", config={})
     with ConfigBuilder(logger,
                        browser_host="example.net",
                        alternate_hosts={"alt": "example.org"},
diff --git a/tools/wptrunner/wptrunner/tests/test_testloader.py b/tools/wptrunner/wptrunner/tests/test_testloader.py
index 6e93a20..0915f42 100644
--- a/tools/wptrunner/wptrunner/tests/test_testloader.py
+++ b/tools/wptrunner/wptrunner/tests/test_testloader.py
@@ -11,6 +11,7 @@
     DirectoryHashChunker,
     IDHashChunker,
     PathHashChunker,
+    Subsuite,
     TestFilter,
     TestLoader,
     TagFilter,
@@ -79,20 +80,22 @@
         "version": 8,
     }
     manifest = WPTManifest.from_json("/", manifest_json)
+    subsuites = {}
+    subsuites[""] = Subsuite("", config={})
 
     # By default, the loader should include the h2 test.
-    loader = TestLoader({manifest: {"metadata_path": ""}}, ["testharness"], None)
-    assert "testharness" in loader.tests
-    assert len(loader.tests["testharness"]) == 2
-    assert len(loader.disabled_tests) == 0
+    loader = TestLoader({manifest: {"metadata_path": ""}}, ["testharness"], None, subsuites)
+    assert "testharness" in loader.tests[""]
+    assert len(loader.tests[""]["testharness"]) == 2
+    assert len(loader.disabled_tests[""]) == 0
 
     # We can also instruct it to skip them.
-    loader = TestLoader({manifest: {"metadata_path": ""}}, ["testharness"], None, include_h2=False)
-    assert "testharness" in loader.tests
-    assert len(loader.tests["testharness"]) == 1
-    assert "testharness" in loader.disabled_tests
-    assert len(loader.disabled_tests["testharness"]) == 1
-    assert loader.disabled_tests["testharness"][0].url == "/a/bar.h2.html"
+    loader = TestLoader({manifest: {"metadata_path": ""}}, ["testharness"], None, subsuites, include_h2=False)
+    assert "testharness" in loader.tests[""]
+    assert len(loader.tests[""]["testharness"]) == 1
+    assert "testharness" in loader.disabled_tests[""]
+    assert len(loader.disabled_tests[""]["testharness"]) == 1
+    assert loader.disabled_tests[""]["testharness"][0].url == "/a/bar.h2.html"
 
 
 @pytest.mark.xfail(sys.platform == "win32",
@@ -238,6 +241,9 @@
         with open(os.path.join(a_path, "bar.html.ini"), "w") as f:
             f.write("tags: [test-include]\n")
 
+        subsuites = {}
+        subsuites[""] = Subsuite("", config={})
+
         b_path = os.path.join(metadata_path, "b")
         os.makedirs(b_path)
         with open(os.path.join(b_path, "baz.html.ini"), "w") as f:
@@ -245,40 +251,40 @@
 
 
         # Check: no filter loads all tests
-        loader = TestLoader({manifest: {"metadata_path": metadata_path}}, ["testharness"], None)
-        assert len(loader.tests["testharness"]) == 4
+        loader = TestLoader({manifest: {"metadata_path": metadata_path}}, ["testharness"], None, subsuites)
+        assert len(loader.tests[""]["testharness"]) == 4
 
         # Check: specifying a single `test-include` inclusion yields `/a/bar` and `/b/baz`
-        loader = TestLoader({manifest: {"metadata_path": metadata_path}}, ["testharness"], None,
+        loader = TestLoader({manifest: {"metadata_path": metadata_path}}, ["testharness"], None, subsuites,
                             test_filters=[TagFilter({"test-include"}, {})])
-        assert len(loader.tests["testharness"]) == 2
-        assert loader.tests["testharness"][0].id == "/a/bar.html"
-        assert loader.tests["testharness"][0].tags == {"dir:a", "test-include"}
-        assert loader.tests["testharness"][1].id == "/b/baz.html"
-        assert loader.tests["testharness"][1].tags == {"dir:b", "test-include", "test-exclude"}
+        assert len(loader.tests[""]["testharness"]) == 2
+        assert loader.tests[""]["testharness"][0].id == "/a/bar.html"
+        assert loader.tests[""]["testharness"][0].tags == {"dir:a", "test-include"}
+        assert loader.tests[""]["testharness"][1].id == "/b/baz.html"
+        assert loader.tests[""]["testharness"][1].tags == {"dir:b", "test-include", "test-exclude"}
 
         # Check: specifying a single `test-exclude` exclusion rejects only `/b/baz`
-        loader = TestLoader({manifest: {"metadata_path": metadata_path}}, ["testharness"], None,
+        loader = TestLoader({manifest: {"metadata_path": metadata_path}}, ["testharness"], None, subsuites,
                             test_filters=[TagFilter({}, {"test-exclude"})])
-        assert len(loader.tests["testharness"]) == 3
-        assert all(test.id != "/b/baz.html" for test in loader.tests["testharness"])
+        assert len(loader.tests[""]["testharness"]) == 3
+        assert all(test.id != "/b/baz.html" for test in loader.tests[""]["testharness"])
 
         # Check: including `test-include` and excluding `test-exclude` yields only `/a/bar`
-        loader = TestLoader({manifest: {"metadata_path": metadata_path}}, ["testharness"], None,
+        loader = TestLoader({manifest: {"metadata_path": metadata_path}}, ["testharness"], None, subsuites,
                             test_filters=[TagFilter({"test-include"}, {"test-exclude"})])
-        assert len(loader.tests["testharness"]) == 1
-        assert loader.tests["testharness"][0].id == "/a/bar.html"
-        assert loader.tests["testharness"][0].tags == {"dir:a", "test-include"}
+        assert len(loader.tests[""]["testharness"]) == 1
+        assert loader.tests[""]["testharness"][0].id == "/a/bar.html"
+        assert loader.tests[""]["testharness"][0].tags == {"dir:a", "test-include"}
 
         # Check: non-empty intersection of inclusion and exclusion yield zero tests
 
-        loader = TestLoader({manifest: {"metadata_path": metadata_path}}, ["testharness"], None,
+        loader = TestLoader({manifest: {"metadata_path": metadata_path}}, ["testharness"], None, subsuites,
                             test_filters=[TagFilter({"test-include"}, {"test-include"})])
-        assert len(loader.tests["testharness"]) == 0
+        assert len(loader.tests[""]["testharness"]) == 0
 
-        loader = TestLoader({manifest: {"metadata_path": metadata_path}}, ["testharness"], None,
+        loader = TestLoader({manifest: {"metadata_path": metadata_path}}, ["testharness"], None, subsuites,
                             test_filters=[TagFilter({"test-include", "test-exclude"}, {"test-include"})])
-        assert len(loader.tests["testharness"]) == 0
+        assert len(loader.tests[""]["testharness"]) == 0
 
 
 def test_chunk_hash(manifest):
diff --git a/tools/wptrunner/wptrunner/update/metadata.py b/tools/wptrunner/wptrunner/update/metadata.py
index 388b569..4851990 100644
--- a/tools/wptrunner/wptrunner/update/metadata.py
+++ b/tools/wptrunner/wptrunner/update/metadata.py
@@ -11,7 +11,7 @@
     provides = ["update_properties"]
 
     def create(self, state):
-        state.update_properties = products.load_product_update(state.config, state.product)
+        state.update_properties = products.load_product_update(state.config, state.product.name)
 
 
 class UpdateExpected(Step):
diff --git a/tools/wptrunner/wptrunner/wptcommandline.py b/tools/wptrunner/wptrunner/wptcommandline.py
index bc664b0..a1eb078 100644
--- a/tools/wptrunner/wptrunner/wptcommandline.py
+++ b/tools/wptrunner/wptrunner/wptcommandline.py
@@ -8,6 +8,7 @@
 from datetime import timedelta
 
 from . import config
+from . import products
 from . import wpttest
 from .formatters import chromium, wptreport, wptscreenshot
 
@@ -37,8 +38,6 @@
 def create_parser(product_choices=None):
     from mozlog import commandline
 
-    from . import products
-
     if product_choices is None:
         product_choices = products.product_list
 
@@ -142,6 +141,11 @@
                                       nargs="*", default=wpttest.enabled_tests,
                                       choices=wpttest.enabled_tests,
                                       help="Test types to run")
+    test_selection_group.add_argument("--subsuite-file", action="store",
+                                      help="Path to JSON file containing subsuite configuration")
+    # TODO use an empty string argument for the default subsuite
+    test_selection_group.add_argument("--subsuite", action="append", dest="subsuites",
+                                      help="Subsuite names to run. Runs all subsuites when omitted.")
     test_selection_group.add_argument("--include", action="append",
                                       help="URL prefix to include")
     test_selection_group.add_argument("--include-file", action="store",
@@ -456,6 +460,8 @@
 
     kwargs["config"] = config.read(kwargs["config_path"])
 
+    kwargs["product"] = products.Product(kwargs["config"], kwargs["product"])
+
     keys = {"paths": [("prefs", "prefs_root", True),
                       ("run_info", "run_info", True)],
             "web-platform-tests": [("remote_url", "remote_url", False),
@@ -553,13 +559,10 @@
 def check_args(kwargs):
     set_from_config(kwargs)
 
-    if kwargs["product"] is None:
-        kwargs["product"] = "firefox"
-
     if kwargs["manifest_update"] is None:
         kwargs["manifest_update"] = True
 
-    if "sauce" in kwargs["product"]:
+    if "sauce" in kwargs["product"].name:
         kwargs["pause_after_test"] = False
 
     if kwargs["test_list"]:
@@ -642,7 +645,7 @@
             sys.exit(1)
         kwargs["openssl_binary"] = path
 
-    if kwargs["ssl_type"] != "none" and kwargs["product"] == "firefox" and kwargs["certutil_binary"]:
+    if kwargs["ssl_type"] != "none" and kwargs["product"].name == "firefox" and kwargs["certutil_binary"]:
         path = exe_path(kwargs["certutil_binary"])
         if path is None:
             print("certutil-binary argument missing or not a valid executable", file=sys.stderr)
@@ -678,9 +681,6 @@
 def check_args_metadata_update(kwargs):
     set_from_config(kwargs)
 
-    if kwargs["product"] is None:
-        kwargs["product"] = "firefox"
-
     for item in kwargs["run_log"]:
         if os.path.isdir(item):
             print("Log file %s is a directory" % item, file=sys.stderr)
@@ -716,7 +716,7 @@
                                      description="Update script for web-platform-tests tests.")
     # This will be removed once all consumers are updated to the properties-file based system
     parser.add_argument("--product", action="store", choices=product_choices,
-                        default=None, help=argparse.SUPPRESS)
+                        default="firefox", help=argparse.SUPPRESS)
     parser.add_argument("--config", action="store", type=abs_path, help="Path to config file")
     parser.add_argument("--metadata", action="store", type=abs_path, dest="metadata_root",
                         help="Path to the folder containing test metadata"),
diff --git a/tools/wptrunner/wptrunner/wptrunner.py b/tools/wptrunner/wptrunner/wptrunner.py
index 4bfb415..7bf473e 100644
--- a/tools/wptrunner/wptrunner/wptrunner.py
+++ b/tools/wptrunner/wptrunner/wptrunner.py
@@ -13,7 +13,6 @@
 from . import environment as env
 from . import instruments
 from . import mpcontext
-from . import products
 from . import testloader
 from . import wptcommandline
 from . import wptlogging
@@ -48,19 +47,28 @@
     return logger
 
 
-def get_loader(test_paths, product, debug=None, run_info_extras=None, chunker_kwargs=None,
-               test_groups=None, **kwargs):
-    if run_info_extras is None:
-        run_info_extras = {}
+def get_loader(test_paths, product, **kwargs):
+    run_info_extras = product.run_info_extras(**kwargs)
+    base_run_info = wpttest.get_run_info(kwargs["run_info"],
+                                         product.name,
+                                         browser_version=kwargs.get("browser_version"),
+                                         browser_channel=kwargs.get("browser_channel"),
+                                         verify=kwargs.get("verify"),
+                                         debug=kwargs["debug"],
+                                         extras=run_info_extras,
+                                         device_serials=kwargs.get("device_serial"),
+                                         adb_binary=kwargs.get("adb_binary"))
 
-    run_info = wpttest.get_run_info(kwargs["run_info"], product,
-                                    browser_version=kwargs.get("browser_version"),
-                                    browser_channel=kwargs.get("browser_channel"),
-                                    verify=kwargs.get("verify"),
-                                    debug=debug,
-                                    extras=run_info_extras,
-                                    device_serials=kwargs.get("device_serial"),
-                                    adb_binary=kwargs.get("adb_binary"))
+    subsuites = testloader.load_subsuites(logger,
+                                          base_run_info,
+                                          kwargs["subsuite_file"],
+                                          set(kwargs["subsuites"] or []))
+
+    if kwargs["test_groups_file"] is not None:
+        test_groups = testloader.TestGroups(logger,
+                                            kwargs["test_groups_file"])
+    else:
+        test_groups = None
 
     test_manifests = testloader.ManifestLoader(test_paths,
                                                force_manifest_update=kwargs["manifest_update"],
@@ -88,9 +96,15 @@
 
     ssl_enabled = sslutils.get_cls(kwargs["ssl_type"]).ssl_enabled
     h2_enabled = wptserve.utils.http2_compatible()
-    test_loader = testloader.TestLoader(test_manifests,
-                                        kwargs["test_types"],
-                                        run_info,
+
+    test_source, chunker_kwargs = testloader.get_test_src(logger=logger,
+                                                          test_groups=test_groups,
+                                                          **kwargs)
+
+    test_loader = testloader.TestLoader(test_manifests=test_manifests,
+                                        test_types=kwargs["test_types"],
+                                        base_run_info=base_run_info,
+                                        subsuites=subsuites,
                                         manifest_filters=manifest_filters,
                                         test_filters=test_filters,
                                         chunk_type=kwargs["chunk_type"],
@@ -102,16 +116,15 @@
                                         skip_timeout=kwargs["skip_timeout"],
                                         skip_implementation_status=kwargs["skip_implementation_status"],
                                         chunker_kwargs=chunker_kwargs)
-    return run_info, test_loader
+    return test_source, test_loader
 
 
 def list_test_groups(test_paths, product, **kwargs):
     env.do_delayed_imports(logger, test_paths)
 
-    run_info_extras = products.Product(kwargs["config"], product).run_info_extras(**kwargs)
-
-    run_info, test_loader = get_loader(test_paths, product,
-                                       run_info_extras=run_info_extras, **kwargs)
+    _, test_loader = get_loader(test_paths,
+                                product,
+                                **kwargs)
 
     for item in sorted(test_loader.groups(kwargs["test_types"])):
         print(item)
@@ -122,10 +135,7 @@
 
     rv = []
 
-    run_info_extras = products.Product(kwargs["config"], product).run_info_extras(**kwargs)
-
-    run_info, test_loader = get_loader(test_paths, product,
-                                       run_info_extras=run_info_extras, **kwargs)
+    _, test_loader = get_loader(test_paths, product, **kwargs)
 
     for test_type, tests in test_loader.disabled_tests.items():
         for test in tests:
@@ -136,10 +146,7 @@
 def list_tests(test_paths, product, **kwargs):
     env.do_delayed_imports(logger, test_paths)
 
-    run_info_extras = products.Product(kwargs["config"], product).run_info_extras(**kwargs)
-
-    run_info, test_loader = get_loader(test_paths, product,
-                                       run_info_extras=run_info_extras, **kwargs)
+    _, test_loader = get_loader(test_paths, product, **kwargs)
 
     for test in test_loader.test_ids:
         print(test)
@@ -162,107 +169,118 @@
     return kwargs["pause_after_test"]
 
 
-def run_test_iteration(test_status, test_loader, test_source_kwargs, test_source_cls, run_info,
+def log_suite_start(tests_by_group, base_run_info, subsuites, run_by_dir):
+    logger.suite_start(tests_by_group,
+                       name='web-platform-test',
+                       run_info=base_run_info,
+                       extra={"run_by_dir": run_by_dir})
+
+    for name, subsuite in subsuites.items():
+        logger.add_subsuite(name=name, run_info=subsuite.run_info_extras)
+
+
+def run_test_iteration(test_status, test_loader, test_source,
                        recording, test_environment, product, kwargs):
     """Runs the entire test suite.
     This is called for each repeat run requested."""
     tests_by_type = defaultdict(list)
+
     for test_type in test_loader.test_types:
-        type_tests_active = test_loader.tests[test_type]
-        type_tests_disabled = test_loader.disabled_tests[test_type]
-        if type_tests_active or type_tests_disabled:
-            tests_by_type[test_type].extend(type_tests_active)
-            tests_by_type[test_type].extend(type_tests_disabled)
+        for subsuite_name, subsuite in test_loader.subsuites.items():
+            type_tests_active = test_loader.tests[subsuite_name][test_type]
+            type_tests_disabled = test_loader.disabled_tests[subsuite_name][test_type]
+            if type_tests_active or type_tests_disabled:
+                tests_by_type[(subsuite_name, test_type)].extend(type_tests_active)
+                tests_by_type[(subsuite_name, test_type)].extend(type_tests_disabled)
 
-    try:
-        test_groups = test_source_cls.tests_by_group(tests_by_type, **test_source_kwargs)
-    except Exception:
-        logger.critical("Loading tests failed")
-        return False
+    tests_by_group = test_source.cls.tests_by_group(tests_by_type, **kwargs)
 
-    logger.suite_start(tests_by_type,
-                       name='web-platform-test',
-                       run_info=run_info,
-                       extra={"run_by_dir": kwargs["run_by_dir"]})
+    log_suite_start(tests_by_group,
+                    test_loader.base_run_info,
+                    test_loader.subsuites,
+                    kwargs["run_by_dir"])
 
-    test_implementation_by_type = {}
+    test_implementations = {}
+    tests_to_run = defaultdict(list)
 
-    for test_type in kwargs["test_types"]:
-        if test_type not in tests_by_type:
-            continue
+    for test_type in test_loader.test_types:
         executor_cls = product.executor_classes.get(test_type)
         if executor_cls is None:
             logger.warning(f"Unsupported test type {test_type} for product {product.name}")
             continue
-        executor_kwargs = product.get_executor_kwargs(logger,
-                                                      test_type,
-                                                      test_environment,
-                                                      run_info,
-                                                      **kwargs)
         browser_cls = product.get_browser_cls(test_type)
-        browser_kwargs = product.get_browser_kwargs(logger,
-                                                    test_type,
-                                                    run_info,
-                                                    config=test_environment.config,
-                                                    num_test_groups=len(test_groups),
-                                                    **kwargs)
-        test_implementation_by_type[test_type] = TestImplementation(executor_cls,
-                                                                    executor_kwargs,
-                                                                    browser_cls,
-                                                                    browser_kwargs)
 
-    tests_to_run = {}
-    for test_type, test_implementation in test_implementation_by_type.items():
-        executor_cls = test_implementation.executor_cls
+        for subsuite_name, subsuite in test_loader.subsuites.items():
+            if (subsuite_name, test_type) not in tests_by_type:
+                continue
+            run_info = subsuite.run_info
+            executor_kwargs = product.get_executor_kwargs(logger,
+                                                          test_type,
+                                                          test_environment,
+                                                          run_info,
+                                                          subsuite=subsuite,
+                                                          **kwargs)
+            browser_kwargs = product.get_browser_kwargs(logger,
+                                                        test_type,
+                                                        run_info,
+                                                        config=test_environment.config,
+                                                        num_test_groups=len(tests_by_group),
+                                                        subsuite=subsuite,
+                                                        **kwargs)
 
-        for test in test_loader.disabled_tests[test_type]:
-            logger.test_start(test.id)
-            logger.test_end(test.id, status="SKIP")
-            test_status.skipped += 1
+            test_implementations[(subsuite_name, test_type)] = TestImplementation(executor_cls,
+                                                                                  executor_kwargs,
+                                                                                  browser_cls,
+                                                                                  browser_kwargs)
 
-        if test_type == "testharness":
-            tests_to_run[test_type] = []
-            for test in test_loader.tests[test_type]:
-                skip_reason = None
-                if test.testdriver and not executor_cls.supports_testdriver:
-                    skip_reason = "Executor does not support testdriver.js"
-                elif test.jsshell and not executor_cls.supports_jsshell:
-                    skip_reason = "Executor does not support jsshell"
-                if skip_reason:
-                    logger.test_start(test.id)
-                    logger.test_end(test.id, status="SKIP", message=skip_reason)
-                    test_status.skipped += 1
-                else:
-                    tests_to_run[test_type].append(test)
-        else:
-            tests_to_run[test_type] = test_loader.tests[test_type]
+            for test in test_loader.disabled_tests[subsuite_name][test_type]:
+                logger.test_start(test.id, subsuite=subsuite_name)
+                logger.test_end(test.id, status="SKIP", subsuite=subsuite_name)
+                test_status.skipped += 1
 
-    unexpected_tests = set()
-    unexpected_pass_tests = set()
+            if test_type == "testharness":
+                for test in test_loader.tests[subsuite_name][test_type]:
+                    skip_reason = None
+                    if test.testdriver and not executor_cls.supports_testdriver:
+                        skip_reason = "Executor does not support testdriver.js"
+                    elif test.jsshell and not executor_cls.supports_jsshell:
+                        skip_reason = "Executor does not support jsshell"
+                    if skip_reason:
+                        logger.test_start(test.id, subsuite=subsuite_name)
+                        logger.test_end(test.id,
+                                        status="SKIP",
+                                        subsuite=subsuite_name,
+                                        message=skip_reason)
+                        test_status.skipped += 1
+                    else:
+                        tests_to_run[(subsuite_name, test_type)].append(test)
+            else:
+                tests_to_run[(subsuite_name, test_type)] = test_loader.tests[subsuite_name][test_type]
+
+    unexpected_fail_tests = defaultdict(list)
+    unexpected_pass_tests = defaultdict(list)
     recording.pause()
     retry_counts = kwargs["retry_unexpected"]
     for i in range(retry_counts + 1):
         if i > 0:
-            if not kwargs["fail_on_unexpected_pass"]:
-                unexpected_fail_tests = unexpected_tests - unexpected_pass_tests
-            else:
-                unexpected_fail_tests = unexpected_tests
-            if len(unexpected_fail_tests) == 0:
+            if kwargs["fail_on_unexpected_pass"]:
+                for (subsuite, test_type), tests in unexpected_pass_tests.items():
+                    unexpected_fail_tests[(subsuite, test_type)].extend(tests)
+            tests_to_run = unexpected_fail_tests
+            if sum(len(tests) for tests in tests_to_run.values()) == 0:
                 break
-            for test_type, tests in tests_to_run.items():
-                tests_to_run[test_type] = [test for test in tests
-                                           if test.id in unexpected_fail_tests]
+            tests_by_group = test_source.cls.tests_by_group(tests_to_run, **kwargs)
 
             logger.suite_end()
-            logger.suite_start(tests_to_run,
-                               name='web-platform-test',
-                               run_info=run_info,
-                               extra={"run_by_dir": kwargs["run_by_dir"]})
+
+            log_suite_start(tests_by_group,
+                            test_loader.base_run_info,
+                            test_loader.subsuites,
+                            kwargs["run_by_dir"])
 
         with ManagerGroup("web-platform-tests",
-                          test_source_cls,
-                          test_source_kwargs,
-                          test_implementation_by_type,
+                          test_source,
+                          test_implementations,
                           kwargs["rerun"],
                           kwargs["pause_after_test"],
                           kwargs["pause_on_unexpected"],
@@ -285,11 +303,12 @@
                 raise
 
             test_status.total_tests += manager_group.test_count()
-            unexpected_tests = manager_group.unexpected_tests()
+            unexpected_fail_tests = manager_group.unexpected_fail_tests()
             unexpected_pass_tests = manager_group.unexpected_pass_tests()
 
-    test_status.unexpected += len(unexpected_tests)
-    test_status.unexpected_pass += len(unexpected_pass_tests)
+    test_status.unexpected_pass += sum(len(tests) for tests in unexpected_pass_tests.values())
+    test_status.unexpected += sum(len(tests) for tests in unexpected_pass_tests.values())
+    test_status.unexpected += sum(len(tests) for tests in unexpected_fail_tests.values())
     logger.suite_end()
     return True
 
@@ -342,7 +361,7 @@
         self.all_skipped = False
 
 
-def run_tests(config, test_paths, product, **kwargs):
+def run_tests(config, product, test_paths, **kwargs):
     """Set up the test environment, load the list of tests to be executed, and
     invoke the remainder of the code to execute tests"""
     mp = mpcontext.get_context()
@@ -356,8 +375,6 @@
         recording.set(["startup"])
         env.do_delayed_imports(logger, test_paths)
 
-        product = products.Product(config, product)
-
         env_extras = product.get_env_extras(**kwargs)
 
         product.check_args(**kwargs)
@@ -371,20 +388,9 @@
 
         recording.set(["startup", "load_tests"])
 
-        test_groups = (testloader.TestGroupsFile(logger, kwargs["test_groups_file"])
-                       if kwargs["test_groups_file"] else None)
-
-        (test_source_cls,
-         test_source_kwargs,
-         chunker_kwargs) = testloader.get_test_src(logger=logger,
-                                                   test_groups=test_groups,
-                                                   **kwargs)
-        run_info, test_loader = get_loader(test_paths,
-                                           product.name,
-                                           run_info_extras=product.run_info_extras(**kwargs),
-                                           chunker_kwargs=chunker_kwargs,
-                                           test_groups=test_groups,
-                                           **kwargs)
+        test_source, test_loader = get_loader(test_paths,
+                                              product,
+                                              **kwargs)
 
         test_status = TestStatus()
         repeat = kwargs["repeat"]
@@ -404,8 +410,9 @@
                                        "host_cert_path": kwargs["host_cert_path"],
                                        "ca_cert_path": kwargs["ca_cert_path"]}}
 
+        # testharness.js is global, so we can't set the timeout multiplier per subsuite in that file
         testharness_timeout_multipler = product.get_timeout_multiplier("testharness",
-                                                                       run_info,
+                                                                       test_loader.base_run_info,
                                                                        **kwargs)
 
         mojojs_path = kwargs["mojojs_path"] if kwargs["enable_mojojs"] else None
@@ -462,8 +469,8 @@
                 elif repeat > 1:
                     logger.info(f"Repetition {test_status.repeated_runs} / {repeat}")
 
-                iter_success = run_test_iteration(test_status, test_loader, test_source_kwargs,
-                                                  test_source_cls, run_info, recording,
+                iter_success = run_test_iteration(test_status, test_loader, test_source,
+                                                  recording,
                                                   test_environment, product, kwargs)
                 # if there were issues with the suite run(tests not loaded, etc.) return
                 if not iter_success:
diff --git a/tools/wptrunner/wptrunner/wpttest.py b/tools/wptrunner/wptrunner/wpttest.py
index 492549d..ba038a2 100644
--- a/tools/wptrunner/wptrunner/wpttest.py
+++ b/tools/wptrunner/wptrunner/wpttest.py
@@ -86,7 +86,7 @@
 
 
 class RunInfo(Dict[str, Any]):
-    def __init__(self, metadata_root, product, debug,
+    def __init__(self, metadata_root, product_name, debug,
                  browser_version=None,
                  browser_channel=None,
                  verify=None,
@@ -107,7 +107,7 @@
             self["revision"] = rev.decode("utf-8")
 
         self["python_version"] = sys.version_info.major
-        self["product"] = product
+        self["product"] = product_name
         if debug is not None:
             self["debug"] = debug
         elif "debug" not in self: