#!/usr/bin/env python2
#
# Copyright 2017 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
'get_attachments' is a tool to find Testlog attachment files from
BigQuery / GCS across a given date range, and to save them to disk.
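
Example:
  get_attachments.py a01 TESTID -s 2017-01-01 -e 2017-02-01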
"""
from __future__ import print_function
import argparse
import hashlib
import json
import os
import shutil
import subprocess
import sys
from six.moves import xrange

DATE_FORMAT = '%Y%m%d%H%M%S'
HASH_FILE_READ_BLOCK_SIZE = 1024 * 64  # 64 KiB
GSUTIL_TEST_PERMISSION_PATH = 'gs://chromeos-localmirror-private/testing/'
PROJECT_ID = 'chromeos-factory'
DEFAULT_START_TIME = '1970-01-01'
DEFAULT_END_TIME = '2100-12-31'
DEFAULT_TARGET_DIR = 'factory_attachments'
DEFAULT_BQ_PATH = 'bq'
DEFAULT_GSUTIL_PATH = 'gsutil'
DEFAULT_SERIAL_NUMBER_KEY = 'serial_number'


def CheckVersion(args):
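  """Checks the gsutil/bq installations and the caller's permissions.

  Each check_call raises CalledProcessError (aborting the script) if the
  tool is missing or the caller cannot access the project's bucket/dataset.
  """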
  print('Checking your gsutil version...')
  subprocess.check_call([args.gsutil_path, '--version'])
  print('Checking your Storage permission...')
  subprocess.check_call([args.gsutil_path, 'ls',
                         'gs://%s/%s' % (PROJECT_ID, args.dataset_id)])
  print('Checking your bq version...')
  subprocess.check_call([args.bq_path, 'version'])
  print('Checking your BigQuery permission and BigQuery dataset ID...')
  subprocess.check_call([args.bq_path, 'show',
                         '--project_id', PROJECT_ID,
                         '--dataset_id', args.dataset_id])
  print()


def RunQuery(args):
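  """Builds and runs the BigQuery query; returns the result as a JSON string.

  The query unnests each testlog event's attachments and looks up the value
  stored under args.serial_number_key in data.serialNumbers.
  """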
  # We don't use a parameterized query because the web UI doesn't support it.
  query_statement_lines = [
      'SELECT',
      '  attachment.path AS remote,',
      '  FORMAT_TIMESTAMP("%s", time) AS server_time,',
      '  attachment.key AS attachment_key,',
      '  (',
      '    SELECT serialNumber.value',
      '    FROM UNNEST(data.serialNumbers) AS serialNumber',
      '    WHERE serialNumber.key="%s"',
      '  ) AS serial_number',
      'FROM',
      '  `%s.%s.testlog_events` AS data,',
      '  UNNEST(data.attachments) AS attachment',
      'WHERE',
      '  attachment.key LIKE "%%%s%%" AND',
      '  time >= TIMESTAMP("%s") AND',
      '  time < TIMESTAMP("%s")']
  query_statement = '\n'.join(query_statement_lines)
  query = query_statement % (
      DATE_FORMAT, args.serial_number_key, PROJECT_ID, args.dataset_id,
      args.attachment_key, args.start_date, args.end_date)
  print('Executing the query; you can also copy-paste the following query\n'
        'into https://bigquery.cloud.google.com')
  print('-' * 37 + 'QUERY' + '-' * 38 + '\n' + query + '\n' + '-' * 80 + '\n')
  p = subprocess.Popen(
      [args.bq_path, 'query', '--max_rows', '1000000', '--nouse_legacy_sql',
       '--format', 'json'],
      stdin=subprocess.PIPE,
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE)
  result_json, stderrdata = p.communicate(query)
  if p.returncode:
    print('Query failed!')
    print(stderrdata)
    sys.exit(1)
  print()
  return result_json


def Filter(args, results):
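  """Keeps only rows whose serial number was requested via --serial_number."""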
  return [row
          for row in results
          if row['serial_number'] in args.serial_number]


def Download(args, results):
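  """Fetches the attachments into <target_dir>/tmp with 'gsutil -m cp -n'.

  Remote paths are deduplicated and downloaded in batches of 100 to keep
  each gsutil command line short; '-n' skips files that are already present.
  """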
  tmp_dir = os.path.join(args.target_dir, 'tmp')
  remote_list = []
  for row in results:
    row['tmp'] = os.path.join(tmp_dir, row['remote'].split('/')[-1])
    remote_list.append(row['remote'])
  # Remove duplicates in the remote_list.
  remote_list = list(set(remote_list))
  for i in xrange(0, len(remote_list), 100):
    print('Downloading attachments %d-%d of %d...' %
          (i, min(i + 100, len(remote_list)) - 1, len(remote_list)))
    commands = [args.gsutil_path, '-m', 'cp', '-n']
    commands.extend(remote_list[i:i + 100])
    commands.append(tmp_dir)
    subprocess.check_call(commands)


def FileHash(path):
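  """Returns the MD5 hex digest of the file at path, read in 64 KiB chunks."""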
  file_hash = hashlib.md5()
  with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(HASH_FILE_READ_BLOCK_SIZE), b''):
      file_hash.update(chunk)
  return file_hash.hexdigest()


def CopyAndDelete(args, results):
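  """Copies each file from the tmp dir to its final name, then removes tmp.

  The final name is <server_time>_<attachment_key>_<serial_number>_<md5>.
  """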
  tmp_dir = os.path.join(args.target_dir, 'tmp')
  for row in results:
    file_name = '%s_%s_%s_%s' % (row['server_time'], row['attachment_key'],
                                 row['serial_number'] or 'NoSerialNumber',
                                 FileHash(row['tmp']))
    local = os.path.join(args.target_dir, file_name)
    print(row['remote'] + ' --> ' + local)
    shutil.copyfile(row['tmp'], local)
  shutil.rmtree(tmp_dir)


def main():
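  """Parses arguments, queries BigQuery, and downloads matching attachments."""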
  parser = argparse.ArgumentParser(
      formatter_class=argparse.RawDescriptionHelpFormatter,
      description='Attachment downloader script',
      epilog='Common errors are permission and installation problems.\n'
             'Please check https://cloud.google.com/sdk/docs/ to\n'
             ' (1) update your gsutil to 4.26 and BigQuery CLI to 2.0.24\n'
             ' (2) authorize gcloud by "gcloud init" or "gcloud auth login"')
  parser.add_argument(
      'dataset_id',
      help='The BigQuery dataset ID. Example: a01')
  parser.add_argument(
      'attachment_key',
      help='The attachment key. Example: TESTID')
  parser.add_argument(
      '--start_date', '-s', default=DEFAULT_START_TIME,
      help='The start date. Default: %s' % DEFAULT_START_TIME)
  parser.add_argument(
      '--end_date', '-e', default=DEFAULT_END_TIME,
      help='The end date. Default: %s' % DEFAULT_END_TIME)
  parser.add_argument(
      '--target_dir', '-t', default=DEFAULT_TARGET_DIR,
      help='The target directory. Default: %s' % DEFAULT_TARGET_DIR)
  parser.add_argument(
      '--bq_path', '-b', default=DEFAULT_BQ_PATH,
      help='The path to bq. Default: %s' % DEFAULT_BQ_PATH)
  parser.add_argument(
      '--gsutil_path', '-g', default=DEFAULT_GSUTIL_PATH,
      help='The path to gsutil. Default: %s' % DEFAULT_GSUTIL_PATH)
  parser.add_argument(
      '--serial_number_key', '-sn_key', default=DEFAULT_SERIAL_NUMBER_KEY,
      help='The key of the serial number to put in the file name. '
           'Default: %s' % DEFAULT_SERIAL_NUMBER_KEY)
  parser.add_argument(
      '--serial_number', '-sn', type=str, action='append',
      help='The value of a serial number to download. This flag can be given '
           'multiple times. Filtering is done by this script rather than in '
           'the query.')
  args = parser.parse_args()

  CheckVersion(args)
  result_json = RunQuery(args).strip()
  results = json.loads(result_json) if result_json else []
  if args.serial_number:
    results = Filter(args, results)
  if not results:
    print('Query returned zero records.\n'
          'Done!')
    return
  print('Found %d files!\n' % len(results))

  tmp_dir = os.path.join(args.target_dir, 'tmp')
  if not os.path.isdir(tmp_dir):
    os.makedirs(tmp_dir)
  Download(args, results)
  CopyAndDelete(args, results)
  print('Done!')


if __name__ == '__main__':
  main()