blob: 0beab70d2adc36d16b20312b677da00a8dceb751 [file] [log] [blame]
#!/usr/bin/env vpython3
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Download all the LOBS in //site."""
import argparse
import hashlib
import io
import os
import sys
import time
import requests
import common
# Module-level cache for a requests.Session object, so that the process
# can reuse a single connection across multiple requests. It starts out
# as None and is created by _download() the first time it is needed.
http_session = None
def _file_sha1(path):
    """Return the hex SHA-1 digest of the file at `path`.

    The file is read in fixed-size chunks so that large objects do not
    have to be held in memory all at once.
    """
    digest = hashlib.sha1()
    with open(path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()


def main():
    """Fetch every LOB under common.SITE_DIR that is missing or stale.

    Each `<name>.sha1` file found by common.walk() holds the expected
    SHA-1 of `<name>`. A download of `<name>` is requested when --force
    is given, when `<name>` does not exist, or when its current SHA-1
    differs from the expected one. Requests are handed to a
    common.JobQueue created with `_download` and the --jobs value.

    Returns:
      0 if nothing needed fetching or every request succeeded, 1 if any
      request reported an error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--force', action='store_true')
    parser.add_argument('-j', '--jobs', type=int, default=os.cpu_count())
    args = parser.parse_args()

    q = common.JobQueue(_download, args.jobs)

    # Strip only the trailing '.sha1'; str.replace() would also remove any
    # '.sha1' that happens to appear elsewhere in the path.
    suffix = '.sha1'
    paths = [
        path[:-len(suffix)] for path in common.walk(common.SITE_DIR)
        if path.endswith(suffix)
    ]

    for path in paths:
        with open(os.path.join(common.SITE_DIR, path + suffix), 'r') as fp:
            expected_sha1 = fp.read().strip()

        target = os.path.join(common.SITE_DIR, path)
        # The existing file is only hashed when it might be kept, i.e.
        # when --force was not given and the file is already present.
        if (args.force or not os.path.exists(target)
                or _file_sha1(target) != expected_sha1):
            q.request(path, expected_sha1)

    if not q.all_tasks():
        return 0

    start = time.time()
    updated = 0
    failed = False
    total_bytes = 0
    for path, res, resp in q.results():
        did_update, num_bytes = resp
        if res:
            print('%s failed: %s' % (path, res))
            failed = True
        if did_update:
            updated += 1
        total_bytes += num_bytes
    elapsed = time.time() - start

    # Guard the rate computation: the clock may not have advanced at all
    # if every request completed (or failed) immediately.
    mbps = (total_bytes * 8 / elapsed / 1_000_000) if elapsed > 0 else 0.0
    print('Fetched %d LOBs (%.1fMB) in %.3f seconds (%.1fMbps).' %
          (updated, (total_bytes / 1_000_000), elapsed, mbps))
    return 1 if failed else 0
def _url(expected_sha1):
return 'https://storage.googleapis.com/%s/%s' % (
'chromium-website-lob-storage', expected_sha1)
def _download(path, expected_sha1):
    """Download the object named `expected_sha1` and store it at `path`.

    The object is fetched from the URL given by `_url(expected_sha1)`.
    Its SHA-1 is checked against `expected_sha1` before anything is
    written, so a bad download never replaces an existing file.
    Connection-level failures (requests.HTTPError, ConnectionError,
    Timeout) are retried up to four attempts in total, pausing one second
    between attempts. A non-OK HTTP status, a hash mismatch, or any other
    exception is reported immediately without retrying.

    Args:
      path: Destination path, relative to common.SITE_DIR.
      expected_sha1: Hex SHA-1 digest the downloaded content must have;
        it is also used to build the download URL.

    Returns:
      A tuple `(error, (did_update, num_bytes))`. `error` is '' on
      success and a description of the failure otherwise. `did_update`
      is True only if the file was written. `num_bytes` is the length of
      the response body that was received (0 if there was none).
    """
    # This is used to hold a global requests.Session object so that the
    # process can reuse a single connection across multiple requests.
    global http_session
    if http_session is None:
        http_session = requests.Session()

    url = _url(expected_sha1)
    max_attempts = 4
    last_error = ''
    for attempt in range(max_attempts):
        if attempt:
            # Pause before each retry, but not after the final failure.
            time.sleep(1)
        try:
            resp = http_session.get(url)
            if resp.status_code != requests.codes.ok:
                return (f'download failed with HTTP/{resp.status_code}',
                        (False, len(resp.content)))

            actual_sha1 = hashlib.sha1(resp.content).hexdigest()
            if actual_sha1 != expected_sha1:
                return ('sha1 mismatch: expected %s, got %s' %
                        (expected_sha1, actual_sha1),
                        (False, len(resp.content)))

            with open(os.path.join(common.SITE_DIR, path), 'wb') as fp:
                fp.write(resp.content)
            # Done: return right away instead of looping and fetching the
            # same object again.
            return '', (True, len(resp.content))
        except (requests.HTTPError, requests.ConnectionError,
                requests.Timeout) as e:
            # Transient failure: remember it and try again if any
            # attempts remain. Fall back to repr() so that the error is
            # never the empty string, which callers read as success.
            last_error = str(e) or repr(e)
        except Exception as e:
            return str(e) or repr(e), (False, 0)
    return last_error, (False, 0)
if __name__ == '__main__':
    # Run only when executed as a script; main()'s return value becomes
    # the process exit status.
    exit_code = main()
    sys.exit(exit_code)