#!/usr/bin/env vpython3
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Export www.chromium.org to local files.

This script uses the Google GData and Google Sites APIs to extract the
content from http://www.chromium.org/ and write it into local files
that can be used to serve the same content.

The APIs are documented at

  https://developers.google.com/sites/docs/1.0/developers_guide_protocol
  https://developers.google.com/gdata/docs/json

Because www.chromium.org is a public site, this script requires no
authentication to work.

The export process attempts to convert the original content into sane,
modern HTML as much as possible without significantly changing the
appearance of any page, apart from a few minor exceptions.
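
Example invocations (the script name here is a placeholder for whatever
this file is saved as; the flags are the ones defined in main() below):

  # Convert a single page and print the resulting Markdown:
  vpython3 export.py -t /developers/design-documents

  # Re-export everything, ignoring cached timestamps:
  vpython3 export.py --force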
"""

import argparse
import io
import json
import os
import pdb
import sys
import time
import traceback
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

import yaml

import common
import html2markdown


def main():
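    """Parse command-line flags, fetch the Sites content feed, and export
    the requested pages and attachments to local files."""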
parser = argparse.ArgumentParser()
parser.add_argument('--force', action='store_true',
help='ignore updated timestamps in local cache')
parser.add_argument('-t', '--test', action='store_true')
parser.add_argument('-r', '--raw', action='store_true')
parser.add_argument('-v', '--verbose', action='count')
parser.add_argument('--max_results', type=int, default=5000)
parser.add_argument('--start-index', type=int, default=1)
parser.add_argument('--path-list')
parser.add_argument('path', nargs='*')
args = parser.parse_args()
entries, parents = _entries(args)
if args.path:
        paths_to_export = [path if path.startswith('/') else '/' + path
                           for path in args.path]
elif args.path_list:
paths_to_export = common.read_paths(args.path_list)
else:
paths_to_export = []
max_input_mtime = max(os.stat(__file__).st_mtime,
os.stat(common.__file__).st_mtime,
os.stat(html2markdown.__file__).st_mtime)
updated = 0
paths = []
if args.test:
entry = _find_entry_by_path(paths_to_export[0], entries, parents)
if entry:
metadata = _metadata(entry, entries, parents)
path = _path(entry, entries, parents)
_ = _handle_entry(path,
(entry, metadata, max_input_mtime, args.force,
args.raw))
content = common.read_text_file('%s/%s.md' % (common.SOURCE_DIR,
path))
print(content)
return 0
else:
print('%s not found' % paths_to_export[0])
return 1
q = common.JobQueue(_handle_entry, common.cpu_count())
paths_to_export = set(paths_to_export)
    for entry in list(entries.values())[:args.max_results]:
if entry['kind'] in ('webpage', 'listpage', 'announcementspage', 'filecabinet'):
metadata = _metadata(entry, entries, parents)
path = _path(entry, entries, parents)
elif entry['kind'] == 'attachment':
metadata = {}
path = entry['url'].replace(
'https://sites.google.com/a/chromium.org/dev/', '')
else:
continue
if not paths_to_export or (
('/' + path).replace('/index', '') in paths_to_export):
q.request(path, (entry, metadata, max_input_mtime, args.force, False))
for path, res, did_update in q.results():
if did_update:
updated += 1
print('updated %d entries' % updated)


def _find_entry_by_path(path, entries, parents):
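    """Return the page entry whose exported path matches `path`, or None."""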
seen = set()
for entry in entries.values():
        if entry['kind'] not in ('webpage', 'listpage', 'announcementspage', 'filecabinet'):
continue
entry_path = _path(entry, entries, parents)
seen.add(entry_path)
if '/' + entry_path in (path, path + '/index'):
return entry
return None


def _handle_entry(task, obj):
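    """JobQueue worker: export a single feed entry to a local file.

    `task` is the output path relative to common.SOURCE_DIR, and `obj` is a
    (entry, metadata, max_input_mtime, force, raw) tuple. Pages are converted
    to Markdown with YAML front matter (or kept as raw HTML when `raw` is
    set); attachments are downloaded unchanged. Returns (err, did_update).
    """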
entry, metadata, max_input_mtime, force, raw = obj
err = ''
did_update = False
yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str
if task in (
'developers/jinja',
'developers/polymer-1-0',
'devtools/breakpoints-tutorial/index.html',
'devtools/breakpoints-tutorial/script.js',
):
# TODO: Eleventy chokes on these files.
return '', False
def repr_str(dumper, data):
if '\n' in data:
return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
style='|')
return dumper.org_represent_str(data)
yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)
mtime = _to_ts(entry['updated'])
if entry['kind'] in ('webpage',
'listpage',
'announcementspage',
'filecabinet'):
target_mtime = max(mtime, max_input_mtime)
path = '%s/%s.md' % (common.SOURCE_DIR, task)
        if _needs_update(path, target_mtime, force):
if raw:
content = entry['content']
else:
content_sio = io.StringIO(entry['content'])
md_sio = io.StringIO()
md_sio.write('---\n')
md_sio.write(yaml.safe_dump(metadata))
md_sio.write('---\n\n')
url_converter = _URLConverter()
html2markdown.Convert(content_sio, md_sio, url_converter)
content = md_sio.getvalue()
content = content.replace(' \b\b\b\b', '')
did_update = common.write_if_changed(path, content.encode('utf-8'))
else:
did_update = False
elif entry['kind'] in ('announcement', 'listitem'):
# TODO: implement me.
pass
elif entry['kind'] == 'attachment':
path = '%s/%s' % (common.SOURCE_DIR, task)
if path in (
'site/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
'site/developers/design-documents/cookie-split-loading/objects.png',
):
# These are expected 404's that we ignore.
did_update = False
elif _needs_update(path, mtime, force):
try:
fp = urlopen(entry['url'])
content = fp.read()
did_update = common.write_if_changed(path, content)
except (HTTPError, URLError, TimeoutError) as e:
err = 'Error: %s' % e
elif entry['kind'] == 'comment':
# ignore comments in the migration
pass
elif entry['kind'] == 'tag':
err = 'tag kind not implemented'
else:
err = 'unknown kind %s' % entry['kind']
return err, did_update


class _URLConverter:
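    """Translates hrefs from the original Google Sites URLs into
    site-relative paths, stripping /_/rsrc prefixes and query strings."""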
def Translate(self, href):
if not href:
return ''
for path in common.alternates:
if href.startswith(path):
href = href.replace(path, '')
if href.startswith('/_/rsrc'):
href = '/' + '/'.join(href.split('/')[4:])
if '?' in href:
href = href[0:href.index('?')]
return href


def _path(entry, entries, parents):
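    """Return the output path for an entry by walking its parent chain.

    Pages that have children become '<page_name>/index' so that their
    children can nest beneath them.
    """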
path = entry['page_name']
if entry['id'] in parents:
path = path + '/index'
parent_id = entry.get('parent_id')
while parent_id:
path = entries[parent_id]['page_name'] + '/' + path
parent_id = entries[parent_id].get('parent_id')
return path


def _metadata(entry, entries, parents):
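    """Build the YAML front-matter dict for a page (page_name, title,
    breadcrumbs)."""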
metadata = {}
metadata['page_name'] = entry['page_name']
metadata['title'] = entry['title']
crumbs = []
parent_id = entry.get('parent_id')
while parent_id:
parent = entries[parent_id]
path = '/' + _path(parent, entries, parents).replace('/index', '')
title = parent['title']
crumbs = [[path, title]] + crumbs
parent_id = parent.get('parent_id')
metadata['breadcrumbs'] = crumbs
if metadata['page_name'] in (
'chromium-projects',
'chromium',
):
metadata['use_title_as_h1'] = False
return metadata


def _needs_update(path, mtime, force):
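    """Return True if `path` is missing or stale, or if `force` is set."""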
if force:
return True
if os.path.exists(path):
st = os.stat(path)
return mtime > st.st_mtime
return True


def _entries(args):
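    """Fetch the full content feed, following 'next' links as needed.

    Returns (entries, parents): a dict of rows keyed by entry id, and the
    set of ids that are referenced as a parent by at least one entry.
    """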
entries = {}
parents = set()
    # Sites appears to cap results at 500 entries per request, even if we
    # ask for more than that.
rownum = 0
url = ('https://sites.google.com/feeds/content/chromium.org/dev'
'?start-index=%d&max-results=%d&alt=json' %
(args.start_index, 500 - rownum))
doc, next_url = _fetch(url, args.force)
for rownum, entry in enumerate(doc['feed']['entry'], start=1):
row = _to_row(entry, rownum)
entries[row['id']] = row
if row.get('parent_id'):
parents.add(row['parent_id'])
if args.verbose:
print(' ... [%d]' % rownum)
while next_url:
doc, next_url = _fetch(next_url, args.force)
        for rownum, entry in enumerate(doc['feed']['entry'], start=rownum + 1):
row = _to_row(entry, rownum)
entries[row['id']] = row
if row.get('parent_id'):
parents.add(row['parent_id'])
if args.verbose:
print(' ... [%d]' % rownum)
return entries, parents


def _fetch(url, force):
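    """Fetch one page of the feed, caching the raw JSON under scripts/feeds/.

    Returns (doc, next_url), where next_url is the feed's 'next' link, or
    None if this is the last page.
    """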
path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
if _needs_update(path, 0, force):
fp = urlopen(url)
content = fp.read()
doc = json.loads(content)
updated = _to_ts(doc['feed']['updated']['$t'])
common.write_if_changed(path, content)
else:
with open(path) as fp:
doc = json.load(fp)
next_url = _find_link(doc['feed'], 'next')
return doc, next_url


def _find_link(doc, rel):
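    """Return the href of the link with the given rel, or None."""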
for ent in doc['link']:
if ent['rel'] == rel:
return ent['href']
return None


def _to_row(entry, rownum):
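    """Flatten a GData feed entry into a plain dict of the fields we use."""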
row = {
'rownum': rownum,
'content': entry.get('content', {}).get('$t'),
'id': _to_id(entry['id']['$t']),
'kind': entry['category'][0]['label'],
'published': entry['published']['$t'],
'updated': entry['updated']['$t'],
}
row['page_name'] = entry.get('sites$pageName', {}).get('$t')
row['title'] = entry.get('title', {}).get('$t')
row['alt_url'] = _find_link(entry, 'alternate')
if row['kind'] == 'attachment':
row['url'] = _find_link(entry, 'alternate')
else:
row['url'] = _find_link(entry, 'self')
parent_url = _find_link(entry,
'http://schemas.google.com/sites/2008#parent')
if parent_url:
row['parent_id'] = _to_id(parent_url)
return row


def _to_id(url):
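    """Return the trailing path component of a feed URL (the entry id)."""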
return url[url.rfind('/') + 1:]


def _to_ts(iso_time):
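    """Convert the feed's ISO-8601 UTC timestamp to epoch seconds.

    Note that time.mktime() interprets the parsed value as local time, so
    the result may be offset from the true UTC epoch by the local UTC offset.
    """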
return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))


if __name__ == '__main__':
try:
main()
except Exception:
extype, value, tb = sys.exc_info()
traceback.print_exc()
pdb.post_mortem(tb)