blob: 2349d2dc447bf2e960c79bceb393904aefa924e9 [file] [log] [blame]
# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This module is to process the code coverage metadata."""
import collections
import json
import logging
import re
import zlib
from google.appengine.ext import ndb
from google.protobuf.field_mask_pb2 import FieldMask
from google.protobuf import json_format
from common.findit_http_client import FinditHttpClient
from common.waterfall.buildbucket_client import GetV2Build
from gae_libs.caches import PickledMemCache
from gae_libs.handlers.base_handler import BaseHandler, Permission
from gae_libs.gitiles.cached_gitiles_repository import CachedGitilesRepository
from libs.cache_decorator import Cached
from model.proto.gen.code_coverage_pb2 import CoverageReport
from model.code_coverage import CoverageData
from model.code_coverage import PostsubmitReport
from model.code_coverage import PresubmitReport
from waterfall import waterfall_config
# List of Gerrit projects that Findit supports.
_PROJECTS_WHITELIST = set(['chromium/src'])
# The regex to extract the build id from the url path: the path must end in
# '/build/<digits>', and group 1 captures the digits.
_BUILD_ID_REGEX = re.compile(r'.*/build/(\d+)$')
# Url regex to extract:
# * postsubmit: the server host, project name, revision, and file.
# * presubmit: the server host, change, patchset, build_id and file.
# These patterns are used so that the html coverage report generated by coverage
# tools could load resources files like style.css etc via relative paths.
# NOTE(review): the statement below is cut off in this copy of the file -- the
# pattern argument and closing parenthesis are missing -- and no definition of
# _URL_REGEX_POSTSUBMIT (used by GetCoverageFile.HandleGet) is visible.
# Restore both from the upstream file.
_URL_REGEX_PRESUBMIT = re.compile(
# Fetches `gs_url`, zlib-decompresses the response body, parses it as JSON and
# parses the resulting dict into a CoverageReport message (unknown fields
# ignored) as a format check, then returns the dict.
# NOTE(review): this copy of the function has lost its indentation, and the
# call prefix of several statements is missing: the dangling `'...', args)`
# fragments below look like the tails of logging calls (presumably
# `logging.<level>(`; TODO confirm the level against upstream and restore).
def _GetValidatedData(gs_url):
"""Returns the json data from the given GS url after validation.
json_data (dict): the json data of the file pointed by the given GS url, or
None if the data can't be retrieved.
"""'Fetching %s', gs_url)
status, content, _ = FinditHttpClient().Get(gs_url)
# NOTE(review): a non-200 status fails this assert instead of returning None,
# which does not match the "or None" wording in the docstring above.
assert status == 200, 'Can not retrieve the data: %s' % gs_url'Decompressing and loading coverage data...')
decompressed_data = zlib.decompress(content)
del content # Explicitly release memory.
data = json.loads(decompressed_data)
del decompressed_data # Explicitly release memory.'Finished decompressing and loading coverage data.')
# Validate that the data is in good format.'Validating coverage data...')
report = CoverageReport()
# The parsed message is only used for validation; the plain dict is returned.
json_format.ParseDict(data, report, ignore_unknown_fields=True)
del report # Explicitly delete the proto message to release memory.'Finished validating coverage data.')
return data
def _DecompressLines(line_ranges):
"""Decompress the lines data to a flat format.
For example:
"count": 1,
"first": 165, // inclusive
"last": 166 // inclusive
After decompressing, it becomes:
"line": 165,
"count": 1
"line": 166,
"count": 1
line_ranges: A list of dict, with format
[{"first": int, "last": int, "count": int}, ...], and note that
the [first, last] are both inclusive.
A list of dict, with format
[{"line": int, "count": int}].
decompressed_lines = []
for line_range in line_ranges:
for line_num in range(line_range['first'], line_range['last'] + 1):
'line': line_num,
'count': line_range['count']
return decompressed_lines
# Handler that, for the build id found in the request path, loads the
# coverage data of that build and stores it (see HandlePost's docstring).
# NOTE(review): this copy of the class has lost its indentation and a number
# of tokens/lines (call prefixes, arguments, `else:` lines, closing brackets).
# The code below is kept exactly as found; the NOTE comments mark the places
# that need to be restored from the upstream file before it can run.
class ProcessCodeCoverageData(BaseHandler): # pragma: no cover.
def _ProcessFullRepositoryData(self, commit, data, full_gs_dir, bucket_name,
source_and_report_gs_path, build_id):
# Load the commit log first so that we could fail fast before redo all.
# NOTE(review): the first tuple element is missing here (presumably the
# commit's host -- confirm), and the CachedGitilesRepository(...) expression
# on the next statement is cut off after its first argument.
repo_url = 'https://%s/%s' % (, commit.project)
change_log = CachedGitilesRepository(FinditHttpClient(),
assert change_log is not None, 'Failed to retrieve the commit log'
# Save the file-level, directory-level and line-level coverage data.
# NOTE(review): the second tuple element and closing parenthesis are missing.
code_revision_index = '%s-%s' % (commit.project,
for data_type in ('dirs', 'components', 'files', 'file_shards'):
sub_data = data.get(data_type)
if not sub_data:
continue'Processing %d entries for %s', len(sub_data), data_type)
# 'file_shards' entries are stored under the 'files' data type.
actual_data_type = data_type
if data_type == 'file_shards':
actual_data_type = 'files'
def FlushEntries(entries, total, last=False):
# Flush the data in a batch and release memory.
# Nothing is flushed while fewer than 100 entries are pending, unless this
# is the final call (last=True) and there is at least one pending entry.
if len(entries) < 100 and not (last and entries):
return entries, total
# NOTE(review): the statement that actually writes `entries` is not visible
# in this copy; only the running total and a log-call tail remain.
total += len(entries)'Dumped %d CoverageData entries of type %s', total,
return [], total
def IterateOverFileShards(file_shards):
for file_path in file_shards:
url = '%s/%s' % (full_gs_dir, file_path)
# Download data one by one.
yield _GetValidatedData(url).get('files', [])
# NOTE(review): an `else:` appears to be missing between the next two
# assignments; as written the second would always overwrite the first.
if data_type == 'file_shards':
data_iterator = IterateOverFileShards(sub_data)
data_iterator = [sub_data]
entities = []
total = 0
component_summaries = []
for dataset in data_iterator:
for group_data in dataset:
# NOTE(review): the statement that wraps the three dict entries below
# (presumably an append to component_summaries -- confirm) is missing, as is
# the first argument of CoverageData.Create and whatever adds coverage_data
# to `entities`.
if actual_data_type == 'components':
'name': group_data['path'],
'path': group_data['path'],
'summaries': group_data['summaries'],
coverage_data = CoverageData.Create(, code_revision_index,
group_data['path'], group_data)
entities, total = FlushEntries(entities, total, last=False)
del dataset # Explicitly release memory.
FlushEntries(entities, total, last=True)
if component_summaries:
# Store one extra 'components' entry at path '>>' listing every component
# summary, sorted by path.
component_summaries.sort(key=lambda x: x['path'])
CoverageData.Create(, code_revision_index, 'components',
'>>', {
'dirs': component_summaries,
'path': '>>'
component_summaries = []'Summary of all components are saved to datastore.')
# Create a repository-level record so that it shows up on UI.
# NOTE(review): the constructor call for this record is mostly missing.
'%s$%s$%s' % (, commit.project,,,
gs_url='%s/index.html' % full_gs_dir,
def _ProcessCLPatchData(self, patch, data, full_gs_dir, bucket_name,
source_and_report_gs_path, build_id):
# For a CL/patch, we save the entire data in one entity.
# The entity is keyed by '<change>-<patchset>', data type 'patch', path 'ALL'.
# NOTE(review): the first argument of CoverageData.Create and of the key
# tuple below is missing, as is the start of the PresubmitReport statement.
CoverageData.Create(, '%s-%s' % (patch.change, patch.patchset),
'patch', 'ALL', data).put()
PresubmitReport, '%s$%s$%s$%s' % (, patch.change,
patch.patchset, build_id)),,
gs_url='%s/index.html' % full_gs_dir,
def _processCodeCoverageData(self, build_id):
# NOTE(review): the positional arguments of GetV2Build and one entry of the
# field-mask path list (now an empty string) are missing.
build = GetV2Build(
fields=FieldMask(paths=['id', '', 'input', 'builder']))
if not build:
return BaseHandler.CreateError(
'Could not retrieve build #%d from buildbucket, retry' % build_id,
# Only process Chromium coverage bots.
# NOTE(review): the rest of this condition and its body are missing.
if (build.builder.project != 'chromium' or
build.builder.bucket not in ('ci', 'try') or
build.builder.builder not in ('linux-code-coverage',
# Convert the Struct to standard dict, to use .get, .iteritems etc.
# NOTE(review): the argument of dict( and of the third properties.get( call
# below are missing.
properties = dict(
gs_bucket = properties.get('coverage_gs_bucket')
gs_path = properties.get('coverage_metadata_gs_path')
source_and_report_gs_path = properties.get(
# Ensure that the coverage data is ready.
if not gs_bucket or not gs_path or not source_and_report_gs_path:'coverage GS bucket info not available in %r',
# NOTE(review): the format string is empty in this copy; '' % (a, b) raises
# TypeError. The original URL template must be restored.
full_gs_dir = '' % (gs_bucket, gs_path)
gs_url = '%s/all.json.gz' % full_gs_dir
data = _GetValidatedData(gs_url)
# Save the data in json.
if build.builder.bucket == 'try':
# Assume there is only 1 patch which is true in CQ.
assert len(build.input.gerrit_changes) == 1, 'Expect only one patchset'
patch = build.input.gerrit_changes[0]
# Only the 'files' part of the data is stored for a patch.
self._ProcessCLPatchData(patch, data['files'], full_gs_dir, gs_bucket,
source_and_report_gs_path, build_id)
else: # For a commit, we save the data by file and directory.
assert build.input.gitiles_commit is not None, 'Expect a commit'
self._ProcessFullRepositoryData(build.input.gitiles_commit, data,
full_gs_dir, gs_bucket,
source_and_report_gs_path, build_id)
def HandlePost(self):
"""Loads the data from GS bucket, and dumps them into ndb."""'Processing: %s', self.request.path)
# The build id is the trailing '/build/<digits>' part of the request path.
match = _BUILD_ID_REGEX.match(self.request.path)
# NOTE(review): the body of this `if` (beyond a log-call tail) and the
# argument of int( below are missing (presumably the regex's group 1).
if not match:'Build id not found')
build_id = int(
return self._processCodeCoverageData(build_id)
def HandleGet(self):
return self.HandlePost() # For local testing purpose.
def _IsServePresubmitCoverageDataEnabled():
  """Returns True if the feature to serve presubmit coverage data is enabled.

  Returns:
    The value of the 'serve_presubmit_coverage_data' entry in the code
    coverage settings, or False when that entry is absent.
  """
  # Unless the flag is explicitly set, assuming disabled by default.
  return waterfall_config.GetCodeCoverageSettings().get(
      'serve_presubmit_coverage_data', False)
# Builds a URL from the report's bucket name, its source-and-report path and
# `file_path`, and returns the fetched content as unicode (None on a 404).
# NOTE(review): this copy has lost its indentation and some tokens; the code
# is kept exactly as found.
def _GetFileContentFromGS(report, file_path):
"""Returns the content of the given file in the given report."""
bucket_name = report.bucket_name
source_and_report_gs_path = report.source_and_report_gs_path
# NOTE(review): the format string is empty in this copy; '' % (a, b, c)
# raises TypeError. The original URL template must be restored.
gs_url = '' % (
bucket_name, source_and_report_gs_path, file_path)
# Results are cached through PickledMemCache with a 48-hour expiry
# (expire_time is 48 * 60 * 60).
@Cached(PickledMemCache(), expire_time=48 * 60 * 60)
# NOTE(review): the fragment after the colon below looks like the tail of a
# logging call whose prefix is missing -- restore from upstream.
def RetrieveFile(url):'Fetching file %s', url)
status, content, _ = FinditHttpClient().Get(url)
if status == 404:
return None # This is not cached.
# Any status other than 200 or 404 fails the assert below.
assert status == 200, 'Can not retrieve file: %s' % url
# Convert to unicode, because some file content is not. This is for jinja
# template to render the UI.
return unicode(content, 'utf8') # This is cached.
return RetrieveFile(gs_url)
def _GetPathRootAndSeparatorFromDataType(data_type):
"""Returns the path of the root and path separator for the given data type."""
if data_type in ('files', 'dirs'):
return '//', '/'
elif data_type == 'components':
return '>>', '>'
return None, None
def _GetNameToPathSeparator(path, data_type):
  """Returns a list of [name, sub_path] for the given path.

  Each name keeps its trailing separator; the sub_path of a 'components'
  entry does not. Examples:
    1. '//root/src/a' ('files') -> [
         ['root/', '//root/'],
         ['src/', '//root/src/'],
         ['a', '//root/src/a'],
       ]
    2. '//root/src/' ('dirs') -> [
         ['root/', '//root/'],
         ['src/', '//root/src/'],
         ['', '//root/src/'],
       ]
    3. 'component1>component2' ('components') -> [
         ['component1>', 'component1'],
         ['component2', 'component1>component2'],
       ]

  Args:
    path: The path to split. For 'files' and 'dirs' it is expected to start
        with '//'.
    data_type: 'files', 'dirs' or 'components'.

  Returns:
    A list of [name, sub_path] pairs in path order. The list is empty when
    the path is empty or equals the root of the data type.
  """
  path_parts = []
  if not path:
    return path_parts

  path_root, path_separator = _GetPathRootAndSeparatorFromDataType(data_type)
  if path == path_root:
    return path_parts

  if data_type == 'components':
    index = 0
  else:
    index = 2  # Skip the leading '//' in the path.

  # For an unknown data_type the separator is None and str.find raises
  # TypeError below.
  while index >= 0:
    next_index = path.find(path_separator, index)
    if next_index >= 0:
      name = path[index:next_index + 1]
      if data_type == 'components':
        sub_path = path[:next_index]
      else:
        sub_path = path[:next_index + 1]
      next_index += 1
    else:
      # No more separators: the remainder is the last part, and the -1 from
      # find() ends the loop.
      name = path[index:]
      sub_path = path

    path_parts.append([name, sub_path])
    index = next_index

  return path_parts
# Handler that serves stored coverage data. A request with both `change` and
# `patchset` is treated as presubmit; otherwise, if `project` is set, it is
# treated as postsubmit; anything else is answered with a 400 error.
# NOTE(review): this copy of the class has lost its indentation and a number
# of tokens/lines (call prefixes, arguments, `else:` lines, closing brackets).
# The code below is kept exactly as found; the NOTE comments mark the places
# that need to be restored from the upstream file before it can run.
class ServeCodeCoverageData(BaseHandler):
def HandleGet(self):
# NOTE(review): the empty-string default for host may be a stripped host
# name -- confirm against upstream.
host = self.request.get('host', '')
project = self.request.get('project', 'chromium/src')
change = self.request.get('change')
patchset = self.request.get('patchset')
revision = self.request.get('revision')
path = self.request.get('path')
data_type = self.request.get('data_type')
# When data_type is not given, it is derived from the shape of the path.
# NOTE(review): an `else:` appears to be missing before the 'files'
# assignment, and the trailing fragments on that line look like the tails of
# seven logging calls whose prefixes are missing.
if not data_type and path:
if path.endswith('/'):
data_type = 'dirs'
elif path and '>' in path:
data_type = 'components'
data_type = 'files''host=%s', host)'project=%s', project)'change=%s', change)'patchset=%s', patchset)'revision=%s', revision)'data_type=%s', data_type)'path=%s', path)
if change and patchset:'Servicing coverage data for presubmit')
# NOTE(review): both CreateError( calls below are cut off; `kwargs` is
# presumably passed to them -- confirm.
if project not in _PROJECTS_WHITELIST:
kwargs = {'is_project_supported': False}
return BaseHandler.CreateError(
error_message='Project "%s" is not supported.' % project,
if not _IsServePresubmitCoverageDataEnabled():
# TODO( Switch to 'is_service_enabled'.
kwargs = {'is_project_supported': False}
# NOTE(review): 'temporarity' in the message below is a typo for
# 'temporarily'; it is a runtime string, so it is left unchanged here.
return BaseHandler.CreateError(
error_message=('The functionality has been temporarity disabled.'),
# Presubmit data is stored as one entity per '<change>-<patchset>', with data
# type 'patch' and path 'ALL'.
code_revision_index = '%s-%s' % (change, patchset)
entity = CoverageData.Get(host, code_revision_index, 'patch', 'ALL')
if not entity:
return BaseHandler.CreateError(
'Requested coverage data is not found.', 404, allowed_origin='*')
# NOTE(review): the right-hand side of this assignment is missing
# (presumably the entity's stored data -- confirm).
data =
formatted_data = {'files': []}
for file_data in data:
path = file_data['path']
if path.startswith('//'): # Check for safe. Old data don't have '//'.
path = path[2:]
# NOTE(review): the statement wrapping the two dict entries below is missing
# (presumably an append to formatted_data['files'] -- confirm).
'path': path,
'lines': _DecompressLines(file_data['lines']),
return {
'data': {
'host': host,
'project': project,
'change': change,
'patchset': patchset,
'data': formatted_data,
'allowed_origin': '*'
elif project:'Servicing coverage data for postsubmit')
template = None
# Without a revision, up to 100 PostsubmitReport records are listed for the
# codebase view.
# NOTE(review): the query expression below is cut off, and an `else:` appears
# to be missing before the `key = ndb.Key(...)` statement.
if not revision:
query = PostsubmitReport.query(
PostsubmitReport.server_host == host, PostsubmitReport.project ==
entities, _, _ = query.fetch_page(100)
data = [e._to_dict() for e in entities]
template = 'coverage/codebase_view.html'
data_type = 'codebase'
key = ndb.Key(PostsubmitReport, '%s$%s$%s' % (host, project, revision))
report = key.get()
if not report:
return BaseHandler.CreateError('Report record not found', 404)
template = 'coverage/summary_view.html'
# An empty path falls back to the root of the data type.
# NOTE(review): an `else:` appears to be missing before the file_view
# template assignment.
if data_type == 'dirs':
path = path or '//'
elif data_type == 'components':
path = path or '>>'
template = 'coverage/file_view.html'
assert data_type, 'Unknown data_type'
code_revision_index = '%s-%s' % (project, revision)
entity = CoverageData.Get(host, code_revision_index, data_type, path)
if not entity and path.startswith('//'):
# For legacy data, the path in datastore doesn't start with '//'.
path = path[2:]
entity = CoverageData.Get(host, code_revision_index, data_type, path)
# NOTE(review): the value expression before `if entity` is missing.
metadata = if entity else None
data = {
'commit_position': report.commit_position,
'metadata': metadata,
line_to_data = None
if data_type == 'files':
# Maps a 1-based line number to {'line': text, 'count': count}.
line_to_data = collections.defaultdict(dict)
if path.startswith('//'):
path = path[2:]
file_content = _GetFileContentFromGS(report, 'coverage/%s' % path)
# NOTE(review): an `else:` appears to be missing after the placeholder
# branch; as written splitlines() would be called on the empty/None content.
if not file_content:
line_to_data[1]['line'] = '!!!!No source code available!!!!'
line_to_data[1]['count'] = 0
file_lines = file_content.splitlines()
# Every line starts with count -1; lines covered by a range in the metadata
# are then overwritten with that range's count (first/last are inclusive).
for i, line in enumerate(file_lines):
line_to_data[i + 1]['line'] = line
line_to_data[i + 1]['count'] = -1
for line in metadata['lines']:
for i in range(line['first'], line['last'] + 1):
line_to_data[i]['count'] = line['count']
# Turn the mapping into a list of (line_number, data) pairs sorted by line.
line_to_data = list(line_to_data.iteritems())
line_to_data.sort(key=lambda x: x[0])
data['line_to_data'] = line_to_data
if not path.startswith('//'):
path = '//' + path
# Compute the mapping of the name->path mappings in order.
path_parts = _GetNameToPathSeparator(path, data_type)
path_root, _ = _GetPathRootAndSeparatorFromDataType(data_type)
return {
'data': {
'host': host,
'project': project,
'revision': revision,
'path': path,
'path_root': path_root,
'data': data,
'data_type': data_type,
'path_parts': path_parts,
'template': template,
# Neither change+patchset nor project was supplied.
return BaseHandler.CreateError('Invalid request', 400)
# Handler that serves a single file of a stored report: the request path is
# matched against the postsubmit and presubmit URL regexes to find the report
# record, and the file content is then read via _GetFileContentFromGS.
# NOTE(review): this copy of the class has lost its indentation and some
# tokens/lines; _URL_REGEX_POSTSUBMIT is not defined anywhere in view and
# the _URL_REGEX_PRESUBMIT definition is truncated. The code is kept exactly
# as found.
class GetCoverageFile(BaseHandler):
# NOTE(review): the fragment after the colon below looks like the tail of a
# logging call whose prefix is missing.
def HandleGet(self):'Processing %s', self.request.path)
key = None
file_path = None
match = _URL_REGEX_POSTSUBMIT.match(self.request.path)
if match:
host, project, revision, file_path = match.groups()
# NOTE(review): `host or ''` may be a stripped default host name -- confirm.
host = host or ''
project = project or 'chromium/src'
key = ndb.Key(PostsubmitReport, '%s$%s$%s' % (host, project, revision))
# NOTE(review): an `else:` appears to be missing before the presubmit match.
match = _URL_REGEX_PRESUBMIT.match(self.request.path)
if match:
# NOTE(review): the comment at the regex definitions says the presubmit
# pattern also extracts the file, but only four names are unpacked here and
# file_path is never assigned in this branch; as written, the check below
# would answer 404 for presubmit URLs. Confirm against upstream.
host, change, patchset, build_id = match.groups()
host = host or ''
key = ndb.Key(PresubmitReport,
'%s$%s$%s$%s' % (host, change, patchset, build_id))
if not key or not file_path:
return BaseHandler.CreateError('Report record not found', 404)
report = key.get()
if not report:
return BaseHandler.CreateError('Report record not found', 404)
# Content type is text/html or text/css by file extension, else text/plain.
content_type = 'text/plain'
for file_type in ('html', 'css'):
if file_path.endswith('.' + file_type):
content_type = 'text/%s' % file_type
data = _GetFileContentFromGS(report, file_path)
# _GetFileContentFromGS returns None when the file is not found.
return_code = 200 if data is not None else 404
# NOTE(review): the closing brace of this dict is not visible in this copy.
return {
'data': data,
'return_code': return_code,
'content_type': content_type,