scripts/export.py - experimental/website - Git at Google

 #!/usr/bin/env vpython3
 # Copyright 2021 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """Export www.chromium.org to local files.

 This script uses the Google GData and Google Sites APIs to extract the
 content from http://www.chromium.org/ and write it into local files
 that can be used to serve the same content.

 The APIs are documented at

 https://developers.google.com/sites/docs/1.0/developers_guide_protocol
 https://developers.google.com/gdata/docs/json

 Because www.chromium.org is a public site, this script requires no
 authentication to work.

 The exporting process attempts to convert the original content into
 sane modern HTML as much as possible without changing the appearance
 of any page significantly, with some minor exceptions.
 """

 import argparse
 import io
 import json
 import os
 import pdb
 import sys
 import time
 import traceback
 from urllib.request import urlopen
 from urllib.error import HTTPError, URLError

 import yaml

 import common
 import html2markdown


 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--force', action='store_true',
                         help='ignore updated timestamps in local cache')
     parser.add_argument('-t', '--test', action='store_true')
     parser.add_argument('-r', '--raw', action='store_true')
     parser.add_argument('-v', '--verbose', action='count')
     parser.add_argument('--max_results', type=int, default=5000)
     parser.add_argument('--start-index', type=int, default=1)
     parser.add_argument('--path-list')
     parser.add_argument('path', nargs='*')
     args = parser.parse_args()

     entries, parents = _entries(args)

     if args.path:
         paths_to_export = ['%s%s' % ('/' if not path.startswith('/') else '',
                                      path)
                            for path in args.path]
     elif args.path_list:
         paths_to_export = common.read_paths(args.path_list)
     else:
         paths_to_export = []

     max_input_mtime = max(os.stat(__file__).st_mtime,
                           os.stat(common.__file__).st_mtime,
                           os.stat(html2markdown.__file__).st_mtime)

     updated = 0
     paths = []

     if args.test:
         entry = _find_entry_by_path(paths_to_export[0], entries, parents)
         if entry:
             metadata = _metadata(entry, entries, parents)
             path = _path(entry, entries, parents)
             _ = _handle_entry(path,
                               (entry, metadata, max_input_mtime, args.force,
                                args.raw))
             content = common.read_text_file('%s/%s.md' % (common.SOURCE_DIR,
                                                           path))
             print(content)
             return 0
         else:
             print('%s not found' % paths_to_export[0])
             return 1

     q = common.JobQueue(_handle_entry, common.cpu_count())

     paths_to_export = set(paths_to_export)
     for i, entry in enumerate(list(entries.values())[:args.max_results]):
         if entry['kind'] in ('webpage', 'listpage', 'announcementspage', 'filecabinet'):
             metadata = _metadata(entry, entries, parents)
             path = _path(entry, entries, parents)
         elif entry['kind'] == 'attachment':
             metadata = {}
             path = entry['url'].replace(
                  'https://sites.google.com/a/chromium.org/dev/', '')
         else:
             continue

         if not paths_to_export or (
             ('/' + path).replace('/index', '') in paths_to_export):
             q.request(path, (entry, metadata, max_input_mtime, args.force, False))

     for path, res, did_update in q.results():
         if did_update:
             updated += 1

     print('updated %d entries' % updated)


 def _find_entry_by_path(path, entries, parents):
     seen = set()
     for entry in entries.values():
         if entry['kind'] not in ('webpage', 'listpage', 'announcmentspage', 'filecabinet'):
             continue
         entry_path = _path(entry, entries, parents)
         seen.add(entry_path)
         if '/' + entry_path in (path, path + '/index'):
             return entry
     return None


 def _handle_entry(task, obj):
     entry, metadata, max_input_mtime, force, raw = obj
     err = ''
     did_update = False

     yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

     if task in (
         'developers/jinja',
         'developers/polymer-1-0',
         'devtools/breakpoints-tutorial/index.html',
         'devtools/breakpoints-tutorial/script.js',
         ):
         # TODO: Eleventy chokes on these files.
         return '', False

     def repr_str(dumper, data):
         if '\n' in data:
             return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
                                            style='|')
         return dumper.org_represent_str(data)

     yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)


     mtime = _to_ts(entry['updated'])
     if entry['kind'] in ('webpage',
                          'listpage',
                          'announcementspage',
                          'filecabinet'):
         target_mtime = max(mtime, max_input_mtime)
         path = '%s/%s.md' % (common.SOURCE_DIR, task)
         if True or _needs_update(path, target_mtime, force):
             if raw:
                 content = entry['content']
             else:
                 content_sio = io.StringIO(entry['content'])
                 md_sio = io.StringIO()
                 md_sio.write('---\n')
                 md_sio.write(yaml.safe_dump(metadata))
                 md_sio.write('---\n\n')
                 url_converter = _URLConverter()
                 html2markdown.Convert(content_sio, md_sio, url_converter)
                 content = md_sio.getvalue()
                 content = content.replace('    \b\b\b\b', '')
             did_update = common.write_if_changed(path, content.encode('utf-8'))
         else:
             did_update = False
     elif entry['kind'] in ('announcement', 'listitem'):
         # TODO: implement me.
         pass
     elif entry['kind'] == 'attachment':
         path = '%s/%s' % (common.SOURCE_DIR, task)
         if path in (
             'site/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
             'site/developers/design-documents/cookie-split-loading/objects.png',
         ):
             # These are expected 404's that we ignore.
             did_update = False
         elif _needs_update(path, mtime, force):
             try:
                 fp = urlopen(entry['url'])
                 content = fp.read()
                 did_update = common.write_if_changed(path, content)
             except (HTTPError, URLError, TimeoutError) as e:
                 err = 'Error: %s' % e

     elif entry['kind'] == 'comment':
         # ignore comments in the migration
         pass
     elif entry['kind'] == 'tag':
         err = 'tag kind not implemented'
     else:
         err = 'unknown kind %s' % entry['kind']

     return err, did_update


 class _URLConverter:
     def Translate(self, href):
         if not href:
             return ''

         for path in common.alternates:
             if href.startswith(path):
                 href = href.replace(path, '')

         if href.startswith('/_/rsrc'):
             href = '/' + '/'.join(href.split('/')[4:])
         if '?' in href:
             href = href[0:href.index('?')]
         return href


 def _path(entry, entries, parents):
     path = entry['page_name']
     if entry['id'] in parents:
         path = path + '/index'
     parent_id = entry.get('parent_id')
     while parent_id:
         path = entries[parent_id]['page_name'] + '/' + path
         parent_id = entries[parent_id].get('parent_id')

     return path


 def _metadata(entry, entries, parents):
     metadata = {}
     metadata['page_name'] = entry['page_name']
     metadata['title'] = entry['title']

     crumbs = []
     parent_id = entry.get('parent_id')
     while parent_id:
         parent = entries[parent_id]
         path = '/' + _path(parent, entries, parents).replace('/index', '')
         title = parent['title']
         crumbs = [[path, title]] + crumbs
         parent_id = parent.get('parent_id')

     metadata['breadcrumbs'] = crumbs

     if metadata['page_name'] in (
         'chromium-projects',
         'chromium',
     ):
         metadata['use_title_as_h1'] = False

     return metadata


 def _needs_update(path, mtime, force):
     if force:
         return True
     if os.path.exists(path):
         st = os.stat(path)
         return mtime > st.st_mtime
     return True


 def _entries(args):
     entries = {}
     parents = set()

     # Looks like Sites probably caps results at 500 entries per request,
     # even if we request more than that.
     rownum = 0
     url = ('https://sites.google.com/feeds/content/chromium.org/dev'
            '?start-index=%d&max-results=%d&alt=json' %
                (args.start_index, 500 - rownum))
     doc, next_url = _fetch(url, args.force)

     for rownum, entry in enumerate(doc['feed']['entry'], start=1):
         row = _to_row(entry, rownum)
         entries[row['id']] = row
         if row.get('parent_id'):
             parents.add(row['parent_id'])
     if args.verbose:
         print(' ... [%d]' % rownum)
     while next_url:
         doc, next_url = _fetch(next_url, args.force)
         for rownum, entry in enumerate(doc['feed']['entry'], start=rownum):
             row = _to_row(entry, rownum)
             entries[row['id']] = row
             if row.get('parent_id'):
                 parents.add(row['parent_id'])
         if args.verbose:
             print(' ... [%d]' % rownum)

     return entries, parents


 def _fetch(url, force):
     path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
     if _needs_update(path, 0, force):
         fp = urlopen(url)
         content = fp.read()
         doc = json.loads(content)
         updated = _to_ts(doc['feed']['updated']['$t'])
         common.write_if_changed(path, content)
     else:
         with open(path) as fp:
             doc = json.load(fp)
     next_url = _find_link(doc['feed'], 'next')
     return doc, next_url


 def _find_link(doc, rel):
     for ent in doc['link']:
         if ent['rel'] == rel:
             return ent['href']
     return None


 def _to_row(entry, rownum):
     row = {
         'rownum': rownum,
         'content': entry.get('content', {}).get('$t'),
         'id': _to_id(entry['id']['$t']),
         'kind': entry['category'][0]['label'],
         'published': entry['published']['$t'],
         'updated': entry['updated']['$t'],
     }

     row['page_name'] = entry.get('sites$pageName', {}).get('$t')
     row['title'] = entry.get('title', {}).get('$t')
     row['alt_url'] = _find_link(entry, 'alternate')

     if row['kind'] == 'attachment':
         row['url'] = _find_link(entry, 'alternate')
     else:
         row['url'] = _find_link(entry, 'self')

     parent_url = _find_link(entry,
                             'http://schemas.google.com/sites/2008#parent')
     if parent_url:
         row['parent_id'] = _to_id(parent_url)
     return row


 def _to_id(url):
     return url[url.rfind('/') + 1:]


 def _to_ts(iso_time):
     return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))

 if __name__ == '__main__':
     try:
         main()
     except Exception:
         extype, value, tb = sys.exc_info()
         traceback.print_exc()
         pdb.post_mortem(tb)
	#!/usr/bin/env vpython3
	# Copyright 2021 Google LLC
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# https://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Export www.chromium.org to local files.

	This script uses the Google GData and Google Sites APIs to extract the
	content from http://www.chromium.org/ and write it into local files
	that can be used to serve the same content.

	The APIs are documented at

	https://developers.google.com/sites/docs/1.0/developers_guide_protocol
	https://developers.google.com/gdata/docs/json

	Because www.chromium.org is a public site, this script requires no
	authentication to work.

	The exporting process attempts to convert the original content into
	sane modern HTML as much as possible without changing the appearance
	of any page significantly, with some minor exceptions.
	"""

	import argparse
	import io
	import json
	import os
	import pdb
	import sys
	import time
	import traceback
	from urllib.request import urlopen
	from urllib.error import HTTPError, URLError

	import yaml

	import common
	import html2markdown


	def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('--force', action='store_true',
	help='ignore updated timestamps in local cache')
	parser.add_argument('-t', '--test', action='store_true')
	parser.add_argument('-r', '--raw', action='store_true')
	parser.add_argument('-v', '--verbose', action='count')
	parser.add_argument('--max_results', type=int, default=5000)
	parser.add_argument('--start-index', type=int, default=1)
	parser.add_argument('--path-list')
	parser.add_argument('path', nargs='*')
	args = parser.parse_args()

	entries, parents = _entries(args)

	if args.path:
	paths_to_export = ['%s%s' % ('/' if not path.startswith('/') else '',
	path)
	for path in args.path]
	elif args.path_list:
	paths_to_export = common.read_paths(args.path_list)
	else:
	paths_to_export = []

	max_input_mtime = max(os.stat(__file__).st_mtime,
	os.stat(common.__file__).st_mtime,
	os.stat(html2markdown.__file__).st_mtime)

	updated = 0
	paths = []

	if args.test:
	entry = _find_entry_by_path(paths_to_export[0], entries, parents)
	if entry:
	metadata = _metadata(entry, entries, parents)
	path = _path(entry, entries, parents)
	_ = _handle_entry(path,
	(entry, metadata, max_input_mtime, args.force,
	args.raw))
	content = common.read_text_file('%s/%s.md' % (common.SOURCE_DIR,
	path))
	print(content)
	return 0
	else:
	print('%s not found' % paths_to_export[0])
	return 1

	q = common.JobQueue(_handle_entry, common.cpu_count())

	paths_to_export = set(paths_to_export)
	for i, entry in enumerate(list(entries.values())[:args.max_results]):
	if entry['kind'] in ('webpage', 'listpage', 'announcementspage', 'filecabinet'):
	metadata = _metadata(entry, entries, parents)
	path = _path(entry, entries, parents)
	elif entry['kind'] == 'attachment':
	metadata = {}
	path = entry['url'].replace(
	'https://sites.google.com/a/chromium.org/dev/', '')
	else:
	continue

	if not paths_to_export or (
	('/' + path).replace('/index', '') in paths_to_export):
	q.request(path, (entry, metadata, max_input_mtime, args.force, False))

	for path, res, did_update in q.results():
	if did_update:
	updated += 1

	print('updated %d entries' % updated)


	def _find_entry_by_path(path, entries, parents):
	seen = set()
	for entry in entries.values():
	if entry['kind'] not in ('webpage', 'listpage', 'announcmentspage', 'filecabinet'):
	continue
	entry_path = _path(entry, entries, parents)
	seen.add(entry_path)
	if '/' + entry_path in (path, path + '/index'):
	return entry
	return None


	def _handle_entry(task, obj):
	entry, metadata, max_input_mtime, force, raw = obj
	err = ''
	did_update = False

	yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

	if task in (
	'developers/jinja',
	'developers/polymer-1-0',
	'devtools/breakpoints-tutorial/index.html',
	'devtools/breakpoints-tutorial/script.js',
	):
	# TODO: Eleventy chokes on these files.
	return '', False

	def repr_str(dumper, data):
	if '\n' in data:
	return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
	style='\|')
	return dumper.org_represent_str(data)

	yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)


	mtime = _to_ts(entry['updated'])
	if entry['kind'] in ('webpage',
	'listpage',
	'announcementspage',
	'filecabinet'):
	target_mtime = max(mtime, max_input_mtime)
	path = '%s/%s.md' % (common.SOURCE_DIR, task)
	if True or _needs_update(path, target_mtime, force):
	if raw:
	content = entry['content']
	else:
	content_sio = io.StringIO(entry['content'])
	md_sio = io.StringIO()
	md_sio.write('---\n')
	md_sio.write(yaml.safe_dump(metadata))
	md_sio.write('---\n\n')
	url_converter = _URLConverter()
	html2markdown.Convert(content_sio, md_sio, url_converter)
	content = md_sio.getvalue()
	content = content.replace(' \b\b\b\b', '')
	did_update = common.write_if_changed(path, content.encode('utf-8'))
	else:
	did_update = False
	elif entry['kind'] in ('announcement', 'listitem'):
	# TODO: implement me.
	pass
	elif entry['kind'] == 'attachment':
	path = '%s/%s' % (common.SOURCE_DIR, task)
	if path in (
	'site/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
	'site/developers/design-documents/cookie-split-loading/objects.png',
	):
	# These are expected 404's that we ignore.
	did_update = False
	elif _needs_update(path, mtime, force):
	try:
	fp = urlopen(entry['url'])
	content = fp.read()
	did_update = common.write_if_changed(path, content)
	except (HTTPError, URLError, TimeoutError) as e:
	err = 'Error: %s' % e

	elif entry['kind'] == 'comment':
	# ignore comments in the migration
	pass
	elif entry['kind'] == 'tag':
	err = 'tag kind not implemented'
	else:
	err = 'unknown kind %s' % entry['kind']

	return err, did_update


	class _URLConverter:
	def Translate(self, href):
	if not href:
	return ''

	for path in common.alternates:
	if href.startswith(path):
	href = href.replace(path, '')

	if href.startswith('/_/rsrc'):
	href = '/' + '/'.join(href.split('/')[4:])
	if '?' in href:
	href = href[0:href.index('?')]
	return href


	def _path(entry, entries, parents):
	path = entry['page_name']
	if entry['id'] in parents:
	path = path + '/index'
	parent_id = entry.get('parent_id')
	while parent_id:
	path = entries[parent_id]['page_name'] + '/' + path
	parent_id = entries[parent_id].get('parent_id')

	return path


	def _metadata(entry, entries, parents):
	metadata = {}
	metadata['page_name'] = entry['page_name']
	metadata['title'] = entry['title']

	crumbs = []
	parent_id = entry.get('parent_id')
	while parent_id:
	parent = entries[parent_id]
	path = '/' + _path(parent, entries, parents).replace('/index', '')
	title = parent['title']
	crumbs = [[path, title]] + crumbs
	parent_id = parent.get('parent_id')

	metadata['breadcrumbs'] = crumbs

	if metadata['page_name'] in (
	'chromium-projects',
	'chromium',
	):
	metadata['use_title_as_h1'] = False

	return metadata


	def _needs_update(path, mtime, force):
	if force:
	return True
	if os.path.exists(path):
	st = os.stat(path)
	return mtime > st.st_mtime
	return True


	def _entries(args):
	entries = {}
	parents = set()

	# Looks like Sites probably caps results at 500 entries per request,
	# even if we request more than that.
	rownum = 0
	url = ('https://sites.google.com/feeds/content/chromium.org/dev'
	'?start-index=%d&max-results=%d&alt=json' %
	(args.start_index, 500 - rownum))
	doc, next_url = _fetch(url, args.force)

	for rownum, entry in enumerate(doc['feed']['entry'], start=1):
	row = _to_row(entry, rownum)
	entries[row['id']] = row
	if row.get('parent_id'):
	parents.add(row['parent_id'])
	if args.verbose:
	print(' ... [%d]' % rownum)
	while next_url:
	doc, next_url = _fetch(next_url, args.force)
	for rownum, entry in enumerate(doc['feed']['entry'], start=rownum):
	row = _to_row(entry, rownum)
	entries[row['id']] = row
	if row.get('parent_id'):
	parents.add(row['parent_id'])
	if args.verbose:
	print(' ... [%d]' % rownum)

	return entries, parents


	def _fetch(url, force):
	path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
	if _needs_update(path, 0, force):
	fp = urlopen(url)
	content = fp.read()
	doc = json.loads(content)
	updated = _to_ts(doc['feed']['updated']['$t'])
	common.write_if_changed(path, content)
	else:
	with open(path) as fp:
	doc = json.load(fp)
	next_url = _find_link(doc['feed'], 'next')
	return doc, next_url


	def _find_link(doc, rel):
	for ent in doc['link']:
	if ent['rel'] == rel:
	return ent['href']
	return None


	def _to_row(entry, rownum):
	row = {
	'rownum': rownum,
	'content': entry.get('content', {}).get('$t'),
	'id': _to_id(entry['id']['$t']),
	'kind': entry['category'][0]['label'],
	'published': entry['published']['$t'],
	'updated': entry['updated']['$t'],
	}

	row['page_name'] = entry.get('sites$pageName', {}).get('$t')
	row['title'] = entry.get('title', {}).get('$t')
	row['alt_url'] = _find_link(entry, 'alternate')

	if row['kind'] == 'attachment':
	row['url'] = _find_link(entry, 'alternate')
	else:
	row['url'] = _find_link(entry, 'self')

	parent_url = _find_link(entry,
	'http://schemas.google.com/sites/2008#parent')
	if parent_url:
	row['parent_id'] = _to_id(parent_url)
	return row


	def _to_id(url):
	return url[url.rfind('/') + 1:]


	def _to_ts(iso_time):
	return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))

	if __name__ == '__main__':
	try:
	main()
	except Exception:
	extype, value, tb = sys.exc_info()
	traceback.print_exc()
	pdb.post_mortem(tb)