lib/stage_feature_extractor.py - chromiumos/infra/bad_cl_detector - Git at Google

 # Copyright 2017 The Chromium OS Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 """Script to extract stage features from raw protobufs.

 Function structure outline:
   ExtractStageFeatures():
   ---> feature_extractor_utils.GetInOutDir()
   ---> feature_extractor_utils.GetFilenames()
   ---> _ExtractStageFeaturesHelper()
     ---> _GetStagesForCQMessage()
     ---> _ExtractStageFeaturesForSingleCQ()
       ---> _UpdateStageFeaturesDictForBuild()
         ---> _ExtractStageInfoFeatures()

 How to use:
   just call ExtractStageFeatures(for_training) to generate csv from protobufs.
 """
 from __future__ import print_function

 import os

 from chromite.lib import cros_logging as logging
 import pandas

 from lib import constants
 from lib import feature_extractor_utils
 from lib.protos import StagesForCQ_pb2


 # Names of the build stages this module knows about. Each name is used as a
 # per-build failure-counter key; a failed stage whose name is not a key of
 # the per-build dictionary is counted under 'UnknownStage' instead.
 STAGES = ['MasterSlaveSyncCompletion', 'Paygen', 'CPEExport',
           'HWTest [bvt-inline]', 'UnitTest', 'Archive', 'SimpleChromeWorkflow',
           'SlaveFailureSummary', 'VMTest (attempt 1)', 'BuildPackages',
           'AFDODataGenerate', 'Signing', 'DetectRelevantChanges',
           'UploadPrebuilts', 'SetupBoard', 'TestSimpleChromeWorkflow',
           'AndroidMetadata', 'Uprev', 'SignerTest', 'BranchUtilTest',
           'SetupBoard (pre-Patch)', 'InitSDK', 'InitSDK (pre-Patch)',
           'AFDOUpdateEbuild', 'PatchChrome', 'GCETest (attempt 2)', 'CleanUp',
           'DevInstallerPrebuilts', 'HWTest [moblab_quick]', 'BinhostTest',
           'Report', 'HWTest [jetstream_cq]', 'SyncChrome', 'ConfigDump',
           'DetectIrrelevantChanges', 'DownloadAndroidDebugSymbols',
           'UprevAndroid', 'MasterSlaveLKGMSync', 'VMTest (attempt 2)',
           'HWTest [bvt-arc]', 'BuildReexecutionFinished', 'BuildStart',
           'ChromeSDK', 'UploadTestArtifacts', 'HWTest [AFDO_record]',
           'BuildImage', 'ImageTest', 'HWTest [sanity]', 'PublishUprevChanges',
           'ManifestVersionedSync', 'CommitQueueCompletion', 'DebugSymbols',
           'MasterUploadPrebuilts', 'SimpleChromeArtifacts', 'CommitQueueSync',
           'GCETest (attempt 1)', 'HWTest [bvt-cq]', 'HWTest [arc-bvt-cq]',
           'ScheduleSlaves', 'RegenPortageCache', 'BuildPackages (pre-Patch)']
 # Keys every per-build feature dictionary is initialised with (all set to 0):
 # one counter per known stage, plus the accumulated 'duration', the
 # 'important' / 'final' flags and the 'UnknownStage' counter.
 STAGES_FOR_BUILD_KEYS = STAGES + ['duration', 'important', 'final',
                                   'UnknownStage']
 # Documented as the valid keys of the dictionary returned by
 # _ExtractStageInfoFeatures(); not referenced by the code visible here.
 # NOTE(review): presumably mirrors the StageInfo proto fields -- confirm
 # against StagesForCQ.proto.
 STAGE_INFO_KEYS = ['build_id', 'status', 'duration', 'board', 'stage_id',
                    'builder_config', 'stage_name', 'important', 'final']


 def ExtractStageFeatures(for_training, bucket=None):
   """Generate stage-feature csv file(s) from persisted stage protobufs.

   Asks feature_extractor_utils for the input/output directories of the stage
   feature type and for the input filenames, then hands everything to
   _ExtractStageFeaturesHelper, which does the actual extraction and writing.

   Args:
     for_training: boolean, True for training data, False for prediction.
     bucket: Google Cloud Storage bucket. If not None, it means running on app
             engine. Default to None.
   """
   in_out_dirs = feature_extractor_utils.GetInOutDir(
       for_training, constants.FEATURE_TYPE_STAGE)
   input_dir, output_dir = in_out_dirs
   filenames = feature_extractor_utils.GetFilenames(input_dir, bucket)
   _ExtractStageFeaturesHelper(filenames, output_dir, for_training, bucket)


 def _ExtractStageFeaturesHelper(stage_filenames, output_dir, for_training,
                                 bucket):
   """Turn each stage protobuf file into feature rows and write them as csv.

   For training, the rows of all files are gathered and written once to
   constants.STAGE_TRAIN_FILE inside output_dir.
   For prediction, every input file gets its own csv in output_dir, named
   after the input file's basename plus constants.CSV_SUFFIX.

   Args:
     stage_filenames: a list of filenames with path info to load.
     output_dir: output directory to store the resultant csv file.
     for_training: boolean, True for training data, False for prediction.
     bucket: Google Cloud Storage bucket. If not None, it means running on app
             engine.
   """
   training_rows = []

   for path in stage_filenames:
     cq_msg = _GetStagesForCQMessage(path, bucket)
     cq_rows = _ExtractStageFeaturesForSingleCQ(cq_msg)

     if for_training:
       # training rows are written in one go after the loop.
       training_rows.extend(cq_rows)
       continue

     # prediction: each master build result is stored in its own csv file.
     frame = pandas.DataFrame(cq_rows)
     csv_name = os.path.basename(path) + constants.CSV_SUFFIX
     frame.to_csv(os.path.join(output_dir, csv_name), encoding='utf-8')

   if for_training:
     frame = pandas.DataFrame(training_rows)
     train_path = os.path.join(output_dir, constants.STAGE_TRAIN_FILE)
     frame.to_csv(train_path, encoding='utf-8')


 def _ExtractStageFeaturesForSingleCQ(stages_for_cq_msg):
   """Extract Stage features for one CQ run.

   Only stages whose status equals constants.STATUS_FAIL are recorded, so a
   build without any failed stage produces no entry in the result.

   Args:
     stages_for_cq_msg: a StagesForCQ message defined in StagesForCQ.proto. It
                        contains all stages info in a master build and all its
                        slave builds.

   Returns:
     A list of dictionaries containing the information in the stages_for_cq_msg.
     Each dictionary item represents one build, which could be master or slave.
     Its keys are those in STAGES_FOR_BUILD_KEYS plus 'failure_category', which
     is copied from stages_for_cq_msg.
   """
   logging.info('Processing CQ %d' % stages_for_cq_msg.master_build_id)

   # a dictionary to store stage info in present CQ. Key is build id, could
   # be master or slave. Value is a dictionary of stage features of a single
   # build with keys defined in STAGES_FOR_BUILD_KEYS.
   stages_for_cq_dict = {}

   for stage_info_msg in stages_for_cq_msg.stage_msgs:
     # only record failed stages
     if stage_info_msg.status != constants.STATUS_FAIL:
       continue

     build_id = stage_info_msg.build_id
     if build_id not in stages_for_cq_dict:
       # first failed stage seen for this build: start from all-zero features.
       build_features = dict.fromkeys(STAGES_FOR_BUILD_KEYS, 0)
       build_features['failure_category'] = stages_for_cq_msg.failure_category
       stages_for_cq_dict[build_id] = build_features

     _UpdateStageFeaturesDictForBuild(stage_info_msg,
                                      stages_for_cq_dict[build_id])

   # values() exists on both Python 2 and Python 3 dictionaries, whereas
   # iteritems() is Python 2 only; list() keeps the return type a list.
   return list(stages_for_cq_dict.values())


 def _UpdateStageFeaturesDictForBuild(stage_info_msg, current_build_dict):
   """Process a stage message. Update current build result dictionary in place.

   The stage name increments the counter stored under that name (or the
   'UnknownStage' counter when the name is not a key of current_build_dict),
   the duration is added to the running 'duration' total, and a truthy
   'important' or 'final' field sets that key to constants.TRUE. All other
   fields of the message are ignored.

   Args:
     stage_info_msg: a StageInfo message as defined in StagesForCQ.proto.
     current_build_dict: a dictionary to store stage info in current build.
                         Valid keys are defined in STAGES_FOR_BUILD_KEYS.
   """
   stage_result = _ExtractStageInfoFeatures(stage_info_msg)

   # items() works on both Python 2 and Python 3; iteritems() is Python 2 only.
   for key, value in stage_result.items():
     if key == 'stage_name':
       try:
         current_build_dict[value] += 1  # value is actual failed stage name
       except KeyError:
         # This is ok. It may happen if new test stages are added or a rare
         # test is being executed. Update 'UnknownStage' field in this case.
         logging.info('unknown stage encountered: %s' % value)
         current_build_dict['UnknownStage'] += 1
     elif key == 'duration':
       current_build_dict[key] += value
     elif key in ('important', 'final') and value:
       # if any failed stage has 'important' or 'final' equal to True, then we
       # set the corresponding result in current build dict to constants.TRUE.
       current_build_dict[key] = constants.TRUE


 def _GetStagesForCQMessage(filename, bucket):
   """Load a persisted StagesForCQ protobuf record and return it parsed.

   The serialized bytes come from the blob named filename in bucket when
   bucket is truthy, otherwise from the local file filename.

   Args:
     filename: full file name with path information. It takes the form:
               path/to/id_occurrence. The file is the persisted protobuf record.
     bucket: Google Cloud Storage bucket. If not None, it means running on app
             engine.

   Returns:
     a StagesForCQ message object.
   """
   message = StagesForCQ_pb2.StagesForCQ()

   if bucket:
     serialized = bucket.get_blob(filename).download_as_string()
   else:
     with open(filename, 'rb') as proto_file:
       serialized = proto_file.read()

   message.ParseFromString(serialized)
   return message


 def _ExtractStageInfoFeatures(stage_info_msg):
   """Copy every field of a StageInfo message into a plain dictionary.

   Args:
     stage_info_msg: a StageInfo message as defined in StagesForCQ.proto.

   Returns:
     A dictionary mapping each field name listed in
     stage_info_msg.DESCRIPTOR.fields to the value of that attribute on the
     message. The keys are expected to be those in STAGE_INFO_KEYS.
   """
   return {
       field.name: getattr(stage_info_msg, field.name)
       for field in stage_info_msg.DESCRIPTOR.fields
   }
	# Copyright 2017 The Chromium OS Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.
	"""Script to extract stage features from raw protobufs.

	Function structure outline:
	  ExtractStageFeatures():
	  ---> feature_extractor_utils.GetInOutDir()
	  ---> feature_extractor_utils.GetFilenames()
	  ---> _ExtractStageFeaturesHelper()
	    ---> _GetStagesForCQMessage()
	    ---> _ExtractStageFeaturesForSingleCQ()
	      ---> _UpdateStageFeaturesDictForBuild()
	        ---> _ExtractStageInfoFeatures()

	How to use:
	just call ExtractStageFeatures(for_training) to generate csv from protobufs.
	"""
	from __future__ import print_function

	import os

	from chromite.lib import cros_logging as logging
	import pandas

	from lib import constants
	from lib import feature_extractor_utils
	from lib.protos import StagesForCQ_pb2


	# Names of the build stages this module knows about. Each name is used as a
	# per-build failure-counter key; a failed stage whose name is not a key of
	# the per-build dictionary is counted under 'UnknownStage' instead.
	STAGES = ['MasterSlaveSyncCompletion', 'Paygen', 'CPEExport',
	'HWTest [bvt-inline]', 'UnitTest', 'Archive', 'SimpleChromeWorkflow',
	'SlaveFailureSummary', 'VMTest (attempt 1)', 'BuildPackages',
	'AFDODataGenerate', 'Signing', 'DetectRelevantChanges',
	'UploadPrebuilts', 'SetupBoard', 'TestSimpleChromeWorkflow',
	'AndroidMetadata', 'Uprev', 'SignerTest', 'BranchUtilTest',
	'SetupBoard (pre-Patch)', 'InitSDK', 'InitSDK (pre-Patch)',
	'AFDOUpdateEbuild', 'PatchChrome', 'GCETest (attempt 2)', 'CleanUp',
	'DevInstallerPrebuilts', 'HWTest [moblab_quick]', 'BinhostTest',
	'Report', 'HWTest [jetstream_cq]', 'SyncChrome', 'ConfigDump',
	'DetectIrrelevantChanges', 'DownloadAndroidDebugSymbols',
	'UprevAndroid', 'MasterSlaveLKGMSync', 'VMTest (attempt 2)',
	'HWTest [bvt-arc]', 'BuildReexecutionFinished', 'BuildStart',
	'ChromeSDK', 'UploadTestArtifacts', 'HWTest [AFDO_record]',
	'BuildImage', 'ImageTest', 'HWTest [sanity]', 'PublishUprevChanges',
	'ManifestVersionedSync', 'CommitQueueCompletion', 'DebugSymbols',
	'MasterUploadPrebuilts', 'SimpleChromeArtifacts', 'CommitQueueSync',
	'GCETest (attempt 1)', 'HWTest [bvt-cq]', 'HWTest [arc-bvt-cq]',
	'ScheduleSlaves', 'RegenPortageCache', 'BuildPackages (pre-Patch)']
	# Keys every per-build feature dictionary is initialised with (all set to 0):
	# one counter per known stage, plus the accumulated 'duration', the
	# 'important' / 'final' flags and the 'UnknownStage' counter.
	STAGES_FOR_BUILD_KEYS = STAGES + ['duration', 'important', 'final',
	'UnknownStage']
	# Documented as the valid keys of the dictionary returned by
	# _ExtractStageInfoFeatures(); not referenced by the code visible here.
	# NOTE(review): presumably mirrors the StageInfo proto fields -- confirm
	# against StagesForCQ.proto.
	STAGE_INFO_KEYS = ['build_id', 'status', 'duration', 'board', 'stage_id',
	'builder_config', 'stage_name', 'important', 'final']


	def ExtractStageFeatures(for_training, bucket=None):
	  """Generate stage-feature csv file(s) from persisted stage protobufs.

	  Asks feature_extractor_utils for the input/output directories of the stage
	  feature type and for the input filenames, then hands everything to
	  _ExtractStageFeaturesHelper, which does the actual extraction and writing.

	  Args:
	    for_training: boolean, True for training data, False for prediction.
	    bucket: Google Cloud Storage bucket. If not None, it means running on app
	            engine. Default to None.
	  """
	  in_out_dirs = feature_extractor_utils.GetInOutDir(
	      for_training, constants.FEATURE_TYPE_STAGE)
	  input_dir, output_dir = in_out_dirs
	  filenames = feature_extractor_utils.GetFilenames(input_dir, bucket)
	  _ExtractStageFeaturesHelper(filenames, output_dir, for_training, bucket)


	def _ExtractStageFeaturesHelper(stage_filenames, output_dir, for_training,
	                                bucket):
	  """Turn each stage protobuf file into feature rows and write them as csv.

	  For training, the rows of all files are gathered and written once to
	  constants.STAGE_TRAIN_FILE inside output_dir.
	  For prediction, every input file gets its own csv in output_dir, named
	  after the input file's basename plus constants.CSV_SUFFIX.

	  Args:
	    stage_filenames: a list of filenames with path info to load.
	    output_dir: output directory to store the resultant csv file.
	    for_training: boolean, True for training data, False for prediction.
	    bucket: Google Cloud Storage bucket. If not None, it means running on app
	            engine.
	  """
	  training_rows = []

	  for path in stage_filenames:
	    cq_msg = _GetStagesForCQMessage(path, bucket)
	    cq_rows = _ExtractStageFeaturesForSingleCQ(cq_msg)

	    if for_training:
	      # training rows are written in one go after the loop.
	      training_rows.extend(cq_rows)
	      continue

	    # prediction: each master build result is stored in its own csv file.
	    frame = pandas.DataFrame(cq_rows)
	    csv_name = os.path.basename(path) + constants.CSV_SUFFIX
	    frame.to_csv(os.path.join(output_dir, csv_name), encoding='utf-8')

	  if for_training:
	    frame = pandas.DataFrame(training_rows)
	    train_path = os.path.join(output_dir, constants.STAGE_TRAIN_FILE)
	    frame.to_csv(train_path, encoding='utf-8')


	def _ExtractStageFeaturesForSingleCQ(stages_for_cq_msg):
	  """Extract Stage features for one CQ run.

	  Only stages whose status equals constants.STATUS_FAIL are recorded, so a
	  build without any failed stage produces no entry in the result.

	  Args:
	    stages_for_cq_msg: a StagesForCQ message defined in StagesForCQ.proto. It
	                       contains all stages info in a master build and all its
	                       slave builds.

	  Returns:
	    A list of dictionaries containing the information in the
	    stages_for_cq_msg. Each dictionary item represents one build, which could
	    be master or slave. Its keys are those in STAGES_FOR_BUILD_KEYS plus
	    'failure_category', which is copied from stages_for_cq_msg.
	  """
	  logging.info('Processing CQ %d' % stages_for_cq_msg.master_build_id)

	  # a dictionary to store stage info in present CQ. Key is build id, could
	  # be master or slave. Value is a dictionary of stage features of a single
	  # build with keys defined in STAGES_FOR_BUILD_KEYS.
	  stages_for_cq_dict = {}

	  for stage_info_msg in stages_for_cq_msg.stage_msgs:
	    # only record failed stages
	    if stage_info_msg.status != constants.STATUS_FAIL:
	      continue

	    build_id = stage_info_msg.build_id
	    if build_id not in stages_for_cq_dict:
	      # first failed stage seen for this build: start from all-zero features.
	      build_features = dict.fromkeys(STAGES_FOR_BUILD_KEYS, 0)
	      build_features['failure_category'] = stages_for_cq_msg.failure_category
	      stages_for_cq_dict[build_id] = build_features

	    _UpdateStageFeaturesDictForBuild(stage_info_msg,
	                                     stages_for_cq_dict[build_id])

	  # values() exists on both Python 2 and Python 3 dictionaries, whereas
	  # iteritems() is Python 2 only; list() keeps the return type a list.
	  return list(stages_for_cq_dict.values())


	def _UpdateStageFeaturesDictForBuild(stage_info_msg, current_build_dict):
	  """Process a stage message. Update current build result dictionary in place.

	  The stage name increments the counter stored under that name (or the
	  'UnknownStage' counter when the name is not a key of current_build_dict),
	  the duration is added to the running 'duration' total, and a truthy
	  'important' or 'final' field sets that key to constants.TRUE. All other
	  fields of the message are ignored.

	  Args:
	    stage_info_msg: a StageInfo message as defined in StagesForCQ.proto.
	    current_build_dict: a dictionary to store stage info in current build.
	                        Valid keys are defined in STAGES_FOR_BUILD_KEYS.
	  """
	  stage_result = _ExtractStageInfoFeatures(stage_info_msg)

	  # items() works on both Python 2 and Python 3; iteritems() is Python 2 only.
	  for key, value in stage_result.items():
	    if key == 'stage_name':
	      try:
	        current_build_dict[value] += 1  # value is actual failed stage name
	      except KeyError:
	        # This is ok. It may happen if new test stages are added or a rare
	        # test is being executed. Update 'UnknownStage' field in this case.
	        logging.info('unknown stage encountered: %s' % value)
	        current_build_dict['UnknownStage'] += 1
	    elif key == 'duration':
	      current_build_dict[key] += value
	    elif key in ('important', 'final') and value:
	      # if any failed stage has 'important' or 'final' equal to True, then we
	      # set the corresponding result in current build dict to constants.TRUE.
	      current_build_dict[key] = constants.TRUE


	def _GetStagesForCQMessage(filename, bucket):
	  """Load a persisted StagesForCQ protobuf record and return it parsed.

	  The serialized bytes come from the blob named filename in bucket when
	  bucket is truthy, otherwise from the local file filename.

	  Args:
	    filename: full file name with path information. It takes the form:
	              path/to/id_occurrence. The file is the persisted protobuf
	              record.
	    bucket: Google Cloud Storage bucket. If not None, it means running on app
	            engine.

	  Returns:
	    a StagesForCQ message object.
	  """
	  message = StagesForCQ_pb2.StagesForCQ()

	  if bucket:
	    serialized = bucket.get_blob(filename).download_as_string()
	  else:
	    with open(filename, 'rb') as proto_file:
	      serialized = proto_file.read()

	  message.ParseFromString(serialized)
	  return message


	def _ExtractStageInfoFeatures(stage_info_msg):
	  """Copy every field of a StageInfo message into a plain dictionary.

	  Args:
	    stage_info_msg: a StageInfo message as defined in StagesForCQ.proto.

	  Returns:
	    A dictionary mapping each field name listed in
	    stage_info_msg.DESCRIPTOR.fields to the value of that attribute on the
	    message. The keys are expected to be those in STAGE_INFO_KEYS.
	  """
	  return {
	      field.name: getattr(stage_info_msg, field.name)
	      for field in stage_info_msg.DESCRIPTOR.fields
	  }