| #!/usr/bin/env vpython3 |
| # Copyright 2021 Google LLC |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # https://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """Export www.chromium.org to local files. |
| |
| This script uses the Google GData and Google Sites APIs to extract the |
| content from http://www.chromium.org/ and write it into local files |
| that can be used to serve the same content. |
| |
| The APIs are documented at |
| |
| https://developers.google.com/sites/docs/1.0/developers_guide_protocol |
| https://developers.google.com/gdata/docs/json |
| |
| Because www.chromium.org is a public site, this script requires no |
| authentication to work. |
| |
The export process converts the original content into reasonably
modern HTML while preserving the appearance of each page as closely
as possible, with some minor exceptions.
| """ |
| |
| import argparse |
| import io |
| import json |
| import os |
| import pdb |
| import sys |
| import time |
| import traceback |
| from urllib.request import urlopen |
| from urllib.error import HTTPError, URLError |
| |
| import yaml |
| |
| import common |
| import html2markdown |
| |
| |
| def main(): |
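    """Exports the requested paths (or the whole site) to local files."""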
| parser = argparse.ArgumentParser() |
| parser.add_argument('--force', action='store_true', |
| help='ignore updated timestamps in local cache') |
    parser.add_argument('-t', '--test', action='store_true',
                        help='export a single page and print it to stdout')
    parser.add_argument('-r', '--raw', action='store_true',
                        help='write the original HTML instead of Markdown')
| parser.add_argument('-v', '--verbose', action='count') |
| parser.add_argument('--max_results', type=int, default=5000) |
| parser.add_argument('--start-index', type=int, default=1) |
| parser.add_argument('--path-list') |
| parser.add_argument('path', nargs='*') |
| args = parser.parse_args() |
| |
| entries, parents = _entries(args) |
| |
| if args.path: |
        paths_to_export = [path if path.startswith('/') else '/' + path
                           for path in args.path]
| elif args.path_list: |
| paths_to_export = common.read_paths(args.path_list) |
| else: |
| paths_to_export = [] |
| |
| max_input_mtime = max(os.stat(__file__).st_mtime, |
| os.stat(common.__file__).st_mtime, |
| os.stat(html2markdown.__file__).st_mtime) |
| |
| updated = 0 |
| paths = [] |
| |
| if args.test: |
| entry = _find_entry_by_path(paths_to_export[0], entries, parents) |
| if entry: |
| metadata = _metadata(entry, entries, parents) |
| path = _path(entry, entries, parents) |
| _ = _handle_entry(path, |
| (entry, metadata, max_input_mtime, args.force, |
| args.raw)) |
| content = common.read_text_file('%s/%s.md' % (common.SOURCE_DIR, |
| path)) |
| print(content) |
| return 0 |
| else: |
| print('%s not found' % paths_to_export[0]) |
| return 1 |
| |
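    # Export the matching entries in parallel, one job per entry.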
| q = common.JobQueue(_handle_entry, common.cpu_count()) |
| |
| paths_to_export = set(paths_to_export) |
| for i, entry in enumerate(list(entries.values())[:args.max_results]): |
        if entry['kind'] in ('webpage', 'listpage', 'announcementspage',
                             'filecabinet'):
| metadata = _metadata(entry, entries, parents) |
| path = _path(entry, entries, parents) |
| elif entry['kind'] == 'attachment': |
| metadata = {} |
| path = entry['url'].replace( |
| 'https://sites.google.com/a/chromium.org/dev/', '') |
| else: |
| continue |
| |
| if not paths_to_export or ( |
| ('/' + path).replace('/index', '') in paths_to_export): |
| q.request(path, (entry, metadata, max_input_mtime, args.force, False)) |
| |
| for path, res, did_update in q.results(): |
| if did_update: |
| updated += 1 |
| |
| print('updated %d entries' % updated) |
| |
| |
| def _find_entry_by_path(path, entries, parents): |
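    """Returns the page entry whose exported path matches the given path."""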
| seen = set() |
| for entry in entries.values(): |
        if entry['kind'] not in ('webpage', 'listpage', 'announcementspage',
                                 'filecabinet'):
| continue |
| entry_path = _path(entry, entries, parents) |
| seen.add(entry_path) |
| if '/' + entry_path in (path, path + '/index'): |
| return entry |
| return None |
| |
| |
| def _handle_entry(task, obj): |
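    """JobQueue handler that exports a single feed entry.

    task is the destination path relative to common.SOURCE_DIR and obj is
    the (entry, metadata, max_input_mtime, force, raw) tuple queued by
    main(). Pages are converted to Markdown with YAML front matter and
    attachments are downloaded as-is; returns (error_message, did_update).
    """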
| entry, metadata, max_input_mtime, force, raw = obj |
| err = '' |
| did_update = False |
| |
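    # Dump multi-line strings in YAML block (|) style so the generated
    # front matter stays readable.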
| yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str |
| |
| if task in ( |
| 'developers/jinja', |
| 'developers/polymer-1-0', |
| 'devtools/breakpoints-tutorial/index.html'): |
| # TODO: Eleventy chokes on these files. |
| return '', False |
| |
| def repr_str(dumper, data): |
| if '\n' in data: |
| return dumper.represent_scalar(u'tag:yaml.org,2002:str', data, |
| style='|') |
| return dumper.org_represent_str(data) |
| |
| yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper) |
| |
| |
| mtime = _to_ts(entry['updated']) |
| if entry['kind'] in ('webpage', |
| 'listpage', |
| 'announcementspage', |
| 'filecabinet'): |
| target_mtime = max(mtime, max_input_mtime) |
| path = '%s/%s.md' % (common.SOURCE_DIR, task) |
        if _needs_update(path, target_mtime, force):
| if raw: |
| content = entry['content'] |
| else: |
| content_sio = io.StringIO(entry['content']) |
| md_sio = io.StringIO() |
| md_sio.write('---\n') |
| md_sio.write(yaml.safe_dump(metadata) + '\n') |
| md_sio.write('---\n') |
| url_converter = _URLConverter() |
| html2markdown.Convert(content_sio, md_sio, url_converter) |
| content = md_sio.getvalue() |
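                # Strip stray ' \b\b\b\b' sequences from the converted output.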
| content = content.replace(' \b\b\b\b', '') |
| did_update = common.write_if_changed(path, content.encode('utf-8')) |
| else: |
| did_update = False |
| elif entry['kind'] in ('announcement', 'listitem'): |
| # TODO: implement me. |
| pass |
| elif entry['kind'] == 'attachment': |
| path = '%s/%s' % (common.SOURCE_DIR, task) |
| if path in ( |
| 'site/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png', |
| 'site/developers/design-documents/cookie-split-loading/objects.png', |
| ): |
| # These are expected 404's that we ignore. |
| did_update = False |
| elif _needs_update(path, mtime, force): |
| try: |
| fp = urlopen(entry['url']) |
| content = fp.read() |
| did_update = common.write_if_changed(path, content) |
| except (HTTPError, URLError, TimeoutError) as e: |
| err = 'Error: %s' % e |
| |
| elif entry['kind'] == 'comment': |
| # ignore comments in the migration |
| pass |
| elif entry['kind'] == 'tag': |
| err = 'tag kind not implemented' |
| else: |
| err = 'unknown kind %s' % entry['kind'] |
| |
| return err, did_update |
| |
| |
| class _URLConverter: |
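    """Strips common.alternates prefixes from links, making them relative."""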
| def Translate(self, href): |
| if not href: |
| return '' |
| |
| for path in common.alternates: |
| if href.startswith(path): |
| href = href.replace(path, '') |
| return href |
| |
| |
| def _path(entry, entries, parents): |
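    """Returns the output path for an entry, built from its parent pages.

    Entries that have children get a trailing '/index' component so that
    they act as the index page of their section.
    """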
| path = entry['page_name'] |
| if entry['id'] in parents: |
| path = path + '/index' |
| parent_id = entry.get('parent_id') |
| while parent_id: |
| path = entries[parent_id]['page_name'] + '/' + path |
| parent_id = entries[parent_id].get('parent_id') |
| |
| return path |
| |
| |
| def _metadata(entry, entries, parents): |
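    """Builds the YAML front matter dict (page_name, title, breadcrumbs)."""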
| metadata = {} |
| metadata['page_name'] = entry['page_name'] |
| metadata['title'] = entry['title'] |
| |
| crumbs = [] |
| parent = entry.get('parent_id') |
| while parent: |
| path = _path(entries[parent], entries, parents) |
| crumbs.insert(0, (path, entries[parent]['title'])) |
        parent = entries[parent].get('parent_id')
| |
| if crumbs: |
| crumb_str = '<div id="title-crumbs">' |
| for href, title in crumbs: |
| crumb_str += '<a href="%s">%s</a> > ' % (href, title) |
| crumb_str += '</div>' |
| metadata['breadcrumbs'] = crumb_str |
| else: |
| metadata['breadcrumbs'] = '' |
| |
| return metadata |
| |
| |
| def _needs_update(path, mtime, force): |
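    """Returns True if path is missing, older than mtime, or force is set."""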
| if force: |
| return True |
| if os.path.exists(path): |
| st = os.stat(path) |
| return mtime > st.st_mtime |
| return True |
| |
| |
| def _entries(args): |
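    """Fetches every entry in the site's content feed.

    Returns a dict mapping entry id to its row, plus the set of ids that
    are referenced as a parent (i.e. entries that have child pages).
    """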
| entries = {} |
| parents = set() |
| |
    # Sites appears to cap results at 500 entries per request, even if we
    # ask for more than that, so fetch the feed one page at a time.
    rownum = 0
    url = ('https://sites.google.com/feeds/content/chromium.org/dev'
           '?start-index=%d&max-results=500&alt=json' % args.start_index)
| doc, next_url = _fetch(url, args.force) |
| |
| for rownum, entry in enumerate(doc['feed']['entry'], start=1): |
| row = _to_row(entry, rownum) |
| entries[row['id']] = row |
| if row.get('parent_id'): |
| parents.add(row['parent_id']) |
| if args.verbose: |
| print(' ... [%d]' % rownum) |
| while next_url: |
| doc, next_url = _fetch(next_url, args.force) |
        for rownum, entry in enumerate(doc['feed']['entry'],
                                       start=rownum + 1):
| row = _to_row(entry, rownum) |
| entries[row['id']] = row |
| if row.get('parent_id'): |
| parents.add(row['parent_id']) |
| if args.verbose: |
| print(' ... [%d]' % rownum) |
| |
| return entries, parents |
| |
| |
| def _fetch(url, force): |
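    """Fetches one page of the content feed, using a local cache if possible.

    The raw JSON response is cached under scripts/feeds/ and reused unless it
    is missing or force is set. Returns the parsed doc and the URL of the
    next page of results (or None if this is the last page).
    """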
| path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/') |
| if _needs_update(path, 0, force): |
| fp = urlopen(url) |
| content = fp.read() |
| doc = json.loads(content) |
| updated = _to_ts(doc['feed']['updated']['$t']) |
| common.write_if_changed(path, content) |
| else: |
| with open(path) as fp: |
| doc = json.load(fp) |
| next_url = _find_link(doc['feed'], 'next') |
| return doc, next_url |
| |
| |
| def _find_link(doc, rel): |
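    """Returns the href of the first link with the given rel, or None."""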
| for ent in doc['link']: |
| if ent['rel'] == rel: |
| return ent['href'] |
| return None |
| |
| |
| def _to_row(entry, rownum): |
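    """Flattens a raw GData feed entry into the dict used by this script."""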
| row = { |
| 'rownum': rownum, |
| 'content': entry.get('content', {}).get('$t'), |
| 'id': _to_id(entry['id']['$t']), |
| 'kind': entry['category'][0]['label'], |
| 'published': entry['published']['$t'], |
| 'updated': entry['updated']['$t'], |
| } |
| |
| row['page_name'] = entry.get('sites$pageName', {}).get('$t') |
| row['title'] = entry.get('title', {}).get('$t') |
| row['alt_url'] = _find_link(entry, 'alternate') |
| |
| if row['kind'] == 'attachment': |
| row['url'] = _find_link(entry, 'alternate') |
| else: |
| row['url'] = _find_link(entry, 'self') |
| |
| parent_url = _find_link(entry, |
| 'http://schemas.google.com/sites/2008#parent') |
| if parent_url: |
| row['parent_id'] = _to_id(parent_url) |
| return row |
| |
| |
| def _to_id(url): |
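    """Returns the trailing id component of a feed entry URL."""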
| return url[url.rfind('/') + 1:] |
| |
| |
| def _to_ts(iso_time): |
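    """Converts an ISO-8601 timestamp from the feed to a Unix timestamp."""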
| return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ')) |
| |
| if __name__ == '__main__': |
| try: |
| main() |
| except Exception: |
        traceback.print_exc()
        pdb.post_mortem(sys.exc_info()[2])