| # Copyright 2017 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| """Script to extract stage features from raw protobufs. |
| |
Function structure outline:
ExtractStageFeatures():
  ---> feature_extractor_utils.GetInOutDir()
  ---> feature_extractor_utils.GetFilenames()
  ---> _ExtractStageFeaturesHelper()
    ---> _GetStagesForCQMessage()
    ---> _ExtractStageFeaturesForSingleCQ()
      ---> _UpdateStageFeaturesDictForBuild()
        ---> _ExtractStageInfoFeatures()
| |
How to use:
Call ExtractStageFeatures(for_training) to generate csv file(s) from protobufs.
| """ |
| from __future__ import print_function |
| |
| import os |
| |
| from chromite.lib import cros_logging as logging |
| import pandas |
| |
| from lib import constants |
| from lib import feature_extractor_utils |
| from lib.protos import StagesForCQ_pb2 |
| |
| |
# Stage names that get their own counter column in a build's feature
# dictionary. A failed stage whose name is not listed here is counted under
# 'UnknownStage' instead (see _UpdateStageFeaturesDictForBuild).
STAGES = [
    'MasterSlaveSyncCompletion',
    'Paygen',
    'CPEExport',
    'HWTest [bvt-inline]',
    'UnitTest',
    'Archive',
    'SimpleChromeWorkflow',
    'SlaveFailureSummary',
    'VMTest (attempt 1)',
    'BuildPackages',
    'AFDODataGenerate',
    'Signing',
    'DetectRelevantChanges',
    'UploadPrebuilts',
    'SetupBoard',
    'TestSimpleChromeWorkflow',
    'AndroidMetadata',
    'Uprev',
    'SignerTest',
    'BranchUtilTest',
    'SetupBoard (pre-Patch)',
    'InitSDK',
    'InitSDK (pre-Patch)',
    'AFDOUpdateEbuild',
    'PatchChrome',
    'GCETest (attempt 2)',
    'CleanUp',
    'DevInstallerPrebuilts',
    'HWTest [moblab_quick]',
    'BinhostTest',
    'Report',
    'HWTest [jetstream_cq]',
    'SyncChrome',
    'ConfigDump',
    'DetectIrrelevantChanges',
    'DownloadAndroidDebugSymbols',
    'UprevAndroid',
    'MasterSlaveLKGMSync',
    'VMTest (attempt 2)',
    'HWTest [bvt-arc]',
    'BuildReexecutionFinished',
    'BuildStart',
    'ChromeSDK',
    'UploadTestArtifacts',
    'HWTest [AFDO_record]',
    'BuildImage',
    'ImageTest',
    'HWTest [sanity]',
    'PublishUprevChanges',
    'ManifestVersionedSync',
    'CommitQueueCompletion',
    'DebugSymbols',
    'MasterUploadPrebuilts',
    'SimpleChromeArtifacts',
    'CommitQueueSync',
    'GCETest (attempt 1)',
    'HWTest [bvt-cq]',
    'HWTest [arc-bvt-cq]',
    'ScheduleSlaves',
    'RegenPortageCache',
    'BuildPackages (pre-Patch)',
]

# Keys every per-build feature dictionary is initialised with: one counter
# per known stage plus the aggregate fields and the catch-all counter.
STAGES_FOR_BUILD_KEYS = STAGES + [
    'duration',
    'important',
    'final',
    'UnknownStage',
]

# Keys of the dictionary produced by _ExtractStageInfoFeatures for a single
# StageInfo message.
STAGE_INFO_KEYS = [
    'build_id',
    'status',
    'duration',
    'board',
    'stage_id',
    'builder_config',
    'stage_name',
    'important',
    'final',
]
| |
| |
def ExtractStageFeatures(for_training, bucket=None):
  """Turn raw stage protobuf records into csv feature file(s).

  Resolves the input and output directories for stage features, lists the
  protobuf files to read, and hands them to _ExtractStageFeaturesHelper.

  Args:
    for_training: boolean, True for training data, False for prediction.
    bucket: Google Cloud Storage bucket. If not None, it means running on app
            engine. Default to None.
  """
  in_out_dirs = feature_extractor_utils.GetInOutDir(
      for_training, constants.FEATURE_TYPE_STAGE)
  input_dir, output_dir = in_out_dirs

  proto_paths = feature_extractor_utils.GetFilenames(input_dir, bucket)
  _ExtractStageFeaturesHelper(proto_paths, output_dir, for_training, bucket)
| |
| |
def _ExtractStageFeaturesHelper(stage_filenames, output_dir, for_training,
                                bucket):
  """Convert the stage records listed in stage_filenames into csv output.

  Training mode collects the rows of every file and writes them once to
  constants.STAGE_TRAIN_FILE. Prediction mode writes one csv per input file,
  named after the input file's basename plus constants.CSV_SUFFIX.

  Args:
    stage_filenames: a list of filenames with path info to load.
    output_dir: output directory to store the resultant csv file.
    for_training: boolean, True for training data, False for prediction.
    bucket: Google Cloud Storage bucket. If not None, it means running on app
            engine.
  """
  training_rows = []

  for proto_path in stage_filenames:
    cq_message = _GetStagesForCQMessage(proto_path, bucket)
    build_rows = _ExtractStageFeaturesForSingleCQ(cq_message)

    if for_training:
      # Training rows are accumulated and written in one go after the loop.
      training_rows.extend(build_rows)
      continue

    # Prediction: each master build result is stored in its own csv file.
    frame = pandas.DataFrame(build_rows)
    csv_name = os.path.basename(proto_path) + constants.CSV_SUFFIX
    frame.to_csv(os.path.join(output_dir, csv_name), encoding='utf-8')

  if not for_training:
    return

  frame = pandas.DataFrame(training_rows)
  frame.to_csv(os.path.join(output_dir, constants.STAGE_TRAIN_FILE),
               encoding='utf-8')
| |
| |
def _ExtractStageFeaturesForSingleCQ(stages_for_cq_msg):
  """Extract Stage features for one CQ run.

  Only stages whose status equals constants.STATUS_FAIL are recorded, so a
  build without any failed stage produces no entry in the result.

  Args:
    stages_for_cq_msg: a StagesForCQ message defined in StagesForCQ.proto. It
                       contains all stages info in a master build and all its
                       slave builds.

  Returns:
    A list of dictionaries containing the information in the stages_for_cq_msg.
    Each dictionary item represents one build, which could be master or slave.
    Valid keys are defined in STAGES_FOR_BUILD_KEYS, plus 'failure_category'
    which is copied from stages_for_cq_msg.
  """
  # Pass the argument separately so the logging module does the formatting.
  logging.info('Processing CQ %d', stages_for_cq_msg.master_build_id)

  # a dictionary to store stage info in present CQ. Key is build id, could
  # be master or slave. Value is a dictionary of stage features of a single
  # build with keys defined in STAGES_FOR_BUILD_KEYS.
  stages_for_cq_dict = {}

  for stage_info_msg in stages_for_cq_msg.stage_msgs:
    # only record failed stages
    if stage_info_msg.status != constants.STATUS_FAIL:
      continue

    build_id = stage_info_msg.build_id
    if build_id not in stages_for_cq_dict:
      # First failed stage seen for this build: start all counters at zero.
      build_dict = dict.fromkeys(STAGES_FOR_BUILD_KEYS, 0)
      build_dict['failure_category'] = stages_for_cq_msg.failure_category
      stages_for_cq_dict[build_id] = build_dict

    _UpdateStageFeaturesDictForBuild(stage_info_msg,
                                     stages_for_cq_dict[build_id])

  # return a list of dictionaries of the result for each build. values() is
  # used instead of iteritems() because the latter does not exist on Python 3.
  return list(stages_for_cq_dict.values())
| |
| |
def _UpdateStageFeaturesDictForBuild(stage_info_msg, current_build_dict):
  """Process a stage message. Update current build result dictionary in place.

  The stage's name increments the matching counter (or 'UnknownStage' when the
  name has no entry in current_build_dict), its duration is added to
  'duration', and a truthy 'important' or 'final' sets the corresponding entry
  to constants.TRUE. All other fields of the stage are ignored.

  Args:
    stage_info_msg: a StageInfo message as defined in StagesForCQ.proto.
    current_build_dict: a dictionary to store stage info in current build.
                        Valid keys are defined in STAGES_FOR_BUILD_KEYS.
  """
  stage_result = _ExtractStageInfoFeatures(stage_info_msg)

  # items() works on both Python 2 and 3; iteritems() is Python 2 only.
  for key, value in stage_result.items():
    if key == 'stage_name':
      try:
        current_build_dict[value] += 1  # value is actual failed stage name
      except KeyError:
        # This is ok. It may happen if new test stages are added or a rare
        # test is being executed. Update 'UnknownStage' field in this case.
        logging.info('unknown stage encountered: %s', value)
        current_build_dict['UnknownStage'] += 1

    elif key == 'duration':
      current_build_dict[key] += value
    elif key in ('important', 'final'):
      # if any failed stage has 'important' or 'final' equal to True, then we
      # set the corresponding result in current build dict to constants.TRUE.
      if value:
        current_build_dict[key] = constants.TRUE
| |
| |
def _GetStagesForCQMessage(filename, bucket):
  """Load a persisted protobuf record and parse it into a StagesForCQ message.

  Args:
    filename: full file name with path information. It takes the form:
              path/to/id_occurrence. The file is the persisted protobuf record.
    bucket: Google Cloud Storage bucket. If not None, it means running on app
            engine.

  Returns:
    a StagesForCQ message object.
  """
  message = StagesForCQ_pb2.StagesForCQ()

  if not bucket:
    # No bucket: the record is a file on local disk.
    with open(filename, 'rb') as proto_file:
      serialized = proto_file.read()
  else:
    # With a bucket the record is fetched as a blob named by filename.
    serialized = bucket.get_blob(filename).download_as_string()

  message.ParseFromString(serialized)
  return message
| |
| |
def _ExtractStageInfoFeatures(stage_info_msg):
  """Copy every field of a StageInfo message into a plain dictionary.

  Args:
    stage_info_msg: a StageInfo message as defined in StagesForCQ.proto.

  Returns:
    A dictionary mapping each field name listed in the message's DESCRIPTOR to
    the value of that field on stage_info_msg. Valid keys are those of
    STAGE_INFO_KEYS.
  """
  return {
      field.name: getattr(stage_info_msg, field.name)
      for field in stage_info_msg.DESCRIPTOR.fields
  }