blob: c10052e4c97f77e4f46801a78dceee27bfbd41d7 [file] [log] [blame]
#!/usr/bin/env vpython3
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Download all the LOBS in //site."""
import argparse
import hashlib
import io
import os
import sys
import time
import urllib3
from urllib.error import HTTPError, URLError
import common
http = None
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--force', action='store_true')
parser.add_argument('-j', '--jobs', type=int, default=common.cpu_count())
parser.add_argument('-m', '--multiprocess', action='store_true',
default=False)
args = parser.parse_args()
q = common.JobQueue(_handle, args.jobs, args.multiprocess)
paths = [path.replace('.sha1', '')
for path in common.walk(common.SITE_DIR)
if path.endswith('.sha1')]
stdin = ''
for path in paths:
with open(os.path.join(common.SITE_DIR, path + '.sha1'), 'r') as fp:
expected_sha1 = fp.read().strip()
if not args.force and os.path.exists(os.path.join(common.SITE_DIR, path)):
with open(os.path.join(common.SITE_DIR, path), 'rb') as fp:
s = hashlib.sha1()
s.update(fp.read())
actual_sha1 = s.hexdigest()
if args.force or (actual_sha1 != expected_sha1):
q.request(path, (args, expected_sha1))
else:
q.request(path, (args, expected_sha1))
if not len(q.all_tasks()):
return 0
start = time.time()
updated = 0
failed = False
total_bytes = 0
for path, res, resp in q.results():
did_update, num_bytes = resp
if res:
print('%s failed: %s' % (path, res))
failed = True
if did_update:
updated += 1
total_bytes += num_bytes
end = time.time()
print('Fetched %d LOBs (%.1fMB) in %.3f seconds (%.1fMbps).' %
(updated,
(total_bytes / 1_000_000),
(end - start),
(total_bytes * 8 / (end - start) / 1_000_000)))
return 1 if failed else 0
def _url(expected_sha1):
return 'https://storage.googleapis.com/%s/%s' % (
'chromium-website-lob-storage', expected_sha1)
def _handle(path, obj):
args, expected_sha1 = obj
global http
if http is None:
http = urllib3.PoolManager()
url = _url(expected_sha1)
total_bytes = 0
for i in range(4):
try:
resp = http.request('GET', url)
s = hashlib.sha1()
s.update(resp.data)
actual_sha1 = s.hexdigest()
if actual_sha1 != expected_sha1:
return ('sha1 mismatch: expected %s, got %s' % (
expected_sha1, actual_sha1), (False, len(resp.data)))
common.write_binary_file(os.path.join(common.SITE_DIR, path),
resp.data)
except (HTTPError, URLError, TimeoutError) as e:
if i < 4:
time.sleep(1)
else:
return str(e), (False, 0)
except Exception as e:
return str(e), (False, 0)
return '', (True, len(resp.data))
if __name__ == '__main__':
sys.exit(main())