#!/usr/bin/env python2
#
# Copyright 2017 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
'get_attachments' is a tool to find Testlog attachment files from
BigQuery / GCS across a given date range, and to save them to disk.
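
Example:
  get_attachments.py a01 TESTID -s 2017-01-01 -e 2017-02-01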
"""
from __future__ import print_function
import argparse
import hashlib
import json
import os
import shutil
import subprocess
import sys
from six.moves import xrange

DATE_FORMAT = '%Y%m%d%H%M%S'
HASH_FILE_READ_BLOCK_SIZE = 1024 * 64  # 64 KiB
GSUTIL_TEST_PERMISSION_PATH = 'gs://chromeos-localmirror-private/testing/'
PROJECT_ID = 'chromeos-factory'
DEFAULT_START_TIME = '1970-01-01'
DEFAULT_END_TIME = '2100-12-31'
DEFAULT_TARGET_DIR = 'factory_attachments'
DEFAULT_BQ_PATH = 'bq'
DEFAULT_GSUTIL_PATH = 'gsutil'
DEFAULT_SERIAL_NUMBER_KEY = 'serial_number'


def CheckVersion(args):
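  """Checks the gsutil/bq installations and the caller's permissions.

  Each check_call raises CalledProcessError (aborting the script) if the
  tool is missing or the caller cannot access the project's bucket/dataset.
  """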
  print('Checking your gsutil version...')
  subprocess.check_call([args.gsutil_path, '--version'])
  print('Checking your Storage permission...')
  subprocess.check_call([args.gsutil_path, 'ls',
                         'gs://%s/%s' % (PROJECT_ID, args.dataset_id)])
  print('Checking your bq version...')
  subprocess.check_call([args.bq_path, 'version'])
  print('Checking your BigQuery permission and BigQuery dataset ID...')
  subprocess.check_call([args.bq_path, 'show',
                         '--project_id', PROJECT_ID,
                         '--dataset_id', args.dataset_id])
  print()


def RunQuery(args):
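  """Builds and runs the BigQuery query; returns the result as a JSON string.

  The query unnests each testlog event's attachments and looks up the value
  stored under args.serial_number_key in data.serialNumbers.
  """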
  # We don't use a parameterized query because the web UI doesn't support it.
  query_statement_lines = [
      'SELECT',
      '  attachment.path AS remote,',
      '  FORMAT_TIMESTAMP("%s", time) AS server_time,',
      '  attachment.key AS attachment_key,',
      '  (',
      '    SELECT serialNumber.value',
      '    FROM UNNEST(data.serialNumbers) AS serialNumber',
      '    WHERE serialNumber.key="%s"',
      '  ) AS serial_number',
      'FROM',
      '  `%s.%s.testlog_events` AS data,',
      '  UNNEST(data.attachments) AS attachment',
      'WHERE',
      '  attachment.key LIKE "%%%s%%" AND',
      '  time >= TIMESTAMP("%s") AND',
      '  time < TIMESTAMP("%s")']
  query_statement = '\n'.join(query_statement_lines)
  query = query_statement % (
      DATE_FORMAT, args.serial_number_key, PROJECT_ID, args.dataset_id,
      args.attachment_key, args.start_date, args.end_date)
  print('Executing the query; you can also copy-paste the following query\n'
        'into https://bigquery.cloud.google.com')
  print('-' * 37 + 'QUERY' + '-' * 38 + '\n' + query + '\n' + '-' * 80 + '\n')
  p = subprocess.Popen(
      [args.bq_path, 'query', '--max_rows', '1000000', '--nouse_legacy_sql',
       '--format', 'json'],
      stdin=subprocess.PIPE,
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE)
  result_json, stderrdata = p.communicate(query)
  if p.returncode:
    print('Query failed!')
    print(stderrdata)
    sys.exit(1)
  print()
  return result_json


def Filter(args, results):
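  """Keeps only rows whose serial number was requested via --serial_number."""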
  return [row
          for row in results
          if row['serial_number'] in args.serial_number]


def Download(args, results):
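  """Fetches the attachments into <target_dir>/tmp with 'gsutil -m cp -n'.

  Remote paths are deduplicated and downloaded in batches of 100 to keep
  each gsutil command line short; '-n' skips files that are already present.
  """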
  tmp_dir = os.path.join(args.target_dir, 'tmp')
  remote_list = []
  for row in results:
    row['tmp'] = os.path.join(tmp_dir, row['remote'].split('/')[-1])
    remote_list.append(row['remote'])
  # Remove duplicates in the remote_list.
  remote_list = list(set(remote_list))
  for i in xrange(0, len(remote_list), 100):
    print('Downloading attachments %d-%d of %d...' %
          (i, min(i + 100, len(remote_list)) - 1, len(remote_list)))
    commands = [args.gsutil_path, '-m', 'cp', '-n']
    commands.extend(remote_list[i:i + 100])
    commands.append(tmp_dir)
    subprocess.check_call(commands)


def FileHash(path):
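  """Returns the MD5 hex digest of the file at path, read in 64 KiB chunks."""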
  file_hash = hashlib.md5()
  with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(HASH_FILE_READ_BLOCK_SIZE), b''):
      file_hash.update(chunk)
  return file_hash.hexdigest()


def CopyAndDelete(args, results):
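  """Copies each file from the tmp dir to its final name, then removes tmp.

  The final name is <server_time>_<attachment_key>_<serial_number>_<md5>.
  """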
  tmp_dir = os.path.join(args.target_dir, 'tmp')
  for row in results:
    file_name = '%s_%s_%s_%s' % (row['server_time'], row['attachment_key'],
                                 row['serial_number'] or 'NoSerialNumber',
                                 FileHash(row['tmp']))
    local = os.path.join(args.target_dir, file_name)
    print(row['remote'] + ' --> ' + local)
    shutil.copyfile(row['tmp'], local)
  shutil.rmtree(tmp_dir)


def main():
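  """Parses arguments, queries BigQuery, and downloads matching attachments."""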
  parser = argparse.ArgumentParser(
      formatter_class=argparse.RawDescriptionHelpFormatter,
      description='Attachment downloader script',
      epilog='Common errors are permission and installation problems.\n'
             'Please check https://cloud.google.com/sdk/docs/ to\n'
             ' (1) update your gsutil to 4.26 and BigQuery CLI to 2.0.24\n'
             ' (2) authorize gcloud by "gcloud init" or "gcloud auth login"')
  parser.add_argument(
      'dataset_id',
      help='The BigQuery dataset ID. Example: a01')
  parser.add_argument(
      'attachment_key',
      help='The attachment key. Example: TESTID')
  parser.add_argument(
      '--start_date', '-s', default=DEFAULT_START_TIME,
      help='The start date. Default: %s' % DEFAULT_START_TIME)
  parser.add_argument(
      '--end_date', '-e', default=DEFAULT_END_TIME,
      help='The end date. Default: %s' % DEFAULT_END_TIME)
  parser.add_argument(
      '--target_dir', '-t', default=DEFAULT_TARGET_DIR,
      help='The target directory. Default: %s' % DEFAULT_TARGET_DIR)
  parser.add_argument(
      '--bq_path', '-b', default=DEFAULT_BQ_PATH,
      help='The path to bq. Default: %s' % DEFAULT_BQ_PATH)
  parser.add_argument(
      '--gsutil_path', '-g', default=DEFAULT_GSUTIL_PATH,
      help='The path to gsutil. Default: %s' % DEFAULT_GSUTIL_PATH)
  parser.add_argument(
      '--serial_number_key', '-sn_key', default=DEFAULT_SERIAL_NUMBER_KEY,
      help='The key of the serial number to put in the file name. '
           'Default: %s' % DEFAULT_SERIAL_NUMBER_KEY)
  parser.add_argument(
      '--serial_number', '-sn', type=str, action='append',
      help='The value of a serial number to download. This flag can be given '
           'multiple times. Filtering is done by this script rather than in '
           'the query.')
  args = parser.parse_args()

  CheckVersion(args)
  result_json = RunQuery(args).strip()
  results = json.loads(result_json) if result_json else []
  if args.serial_number:
    results = Filter(args, results)
  if not results:
    print('Query returned zero records.\n'
          'Done!')
    return
  print('Found %d files!\n' % len(results))

  tmp_dir = os.path.join(args.target_dir, 'tmp')
  if not os.path.isdir(tmp_dir):
    os.makedirs(tmp_dir)
  Download(args, results)
  CopyAndDelete(args, results)
  print('Done!')


if __name__ == '__main__':
  main()