build/markdown_utils.py - enterprise/cel - Git at Google

 #!/usr/bin/env python

 # Copyright 2017 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 from __future__ import print_function

 import difflib
 import logging
 import os
 import re
 import textwrap


 def ProcessIncludesInContent(lines, fname):
   '''ProcessIncludesInContent expands any INCLUDE directives found in a markdown file.

   Parameters:
     lines : a list of strings with line endings that represent the contents
             of the Markdown file. The list is modified in place; nothing is
             returned.
     fname : the name of the file including the path if applicable. Includes
             are resolved relative to the directory of this path.

   Include directives must start at the beginning of a line and take the form:

       <!-- INCLUDE <relative_path> [(<line_count> lines)] [fenced as <fence_type>] -->

   The parts of the directive in brackets are optional.

   The <relative_path> is an unquoted string with no spaces in it that
   specifies the relative path to the file to be included.

   <fence_type> can be used to indicate that the content of the file should be
   included as a fenced block of the specified type. Note that the type is
   mandatory for the fence specification. If specified, the contents of the
   file will be included in the file as follows:

       ``` <fence_type>
       <contents of the file>
       ```

   Once expanded, the "(<line_count> lines)" portion will be inserted to
   indicate how many lines were added during the expansion. On a later run
   that many lines following the directive are replaced by the fresh
   expansion, so do not change the line count or the included contents.

   Raises:
     Exception : if a line starts with an INCLUDE marker but does not match
                 the directive syntax, or if an included file itself contains
                 an INCLUDE directive (recursive includes are not supported).
     Errors raised by open() propagate if an included file cannot be read.
   '''

   marker = '<!-- INCLUDE '
   directive_re = re.compile(
       r'<!-- INCLUDE +(?P<fn>[^ ]*) +(?:\((?P<lc>\d+) lines\) +|)'
       r'(?:fenced as (?P<ft>\w+) +|)-->')
   dirname = os.path.dirname(fname)

   # Each entry is (index of directive, path, previous line count, fence type).
   directives = []
   for index, line in enumerate(lines):
     if not line.startswith(marker):
       continue
     m = directive_re.match(line)
     if m is None:
       raise Exception("improperly formatted INCLUDE line: {}".format(line))
     directives.append(
         (index, m.group('fn'), int(m.group('lc') or 0), m.group('ft')))

   # Expand bottom-up so that splicing one include does not shift the indices
   # recorded for the directives above it.
   for index, path, old_count, fence_type in reversed(directives):
     with open(os.path.join(dirname, path), 'r') as f:
       included = f.readlines()

     if any(line.startswith(marker) for line in included):
       raise Exception('''Recursive includes are not supported.''')

     # A final line without a newline would merge with whatever follows it
     # once written out. The recorded line count would then be one too high
     # and the next run would swallow a line of the host document.
     if included and not included[-1].endswith('\n'):
       included[-1] += '\n'

     if fence_type is not None:
       included = ['``` {}\n'.format(fence_type)] + included + ['```\n']

     lines[index] = '<!-- INCLUDE {} ({} lines){} -->\n'.format(
         path, len(included),
         ' fenced as {}'.format(fence_type) if fence_type else '')
     # Replace the previous expansion (old_count lines, possibly zero).
     lines[index + 1:index + 1 + old_count] = included


 def FixOldStyleLinks(lines, fname):
   '''FixOldStyleLinks rewrites shortcut links like [foo] as [foo][], in place.

   Gitiles accepts the bare [foo] form, but it is not valid CommonMark, so
   other editors and viewers do not treat it as a link. Appending the empty
   reference makes the link portable.

   Parameters:
     lines : list of strings holding the document, one line per element. The
             list is modified in place; nothing is returned.
     fname : not used by this function.

   A bracketed span is left alone when it directly follows "]" or is directly
   followed by "[", ":" or "(". Lines from one line containing "```" up to
   (but not including) the next such line are skipped, as is any line
   containing "[TOC]".
   '''

   shortcut_re = re.compile(r'(^|[^]])\[(?P<ref>[^]]+)\](?=$|[^[:(])')
   replacement = r'\1[\g<ref>][]'

   inside_fence = False
   for index, text in enumerate(lines):
     # Every line mentioning a fence marker flips the state, so the opening
     # fence line is skipped while the closing fence line is processed.
     if '```' in text:
       inside_fence = not inside_fence

     if inside_fence or "[TOC]" in text:
       continue

     lines[index] = shortcut_re.sub(replacement, text)


 def FixTrailingWhitespace(lines, fname):
   '''FixTrailingWhitespace removes trailing whitespace from every line, in place.

   Parameters:
     lines : list of strings holding the document, one line per element. The
             list is modified in place; nothing is returned.
     fname : not used by this function.

   A line that ends with a newline keeps exactly one "\\n" after stripping, so
   a "\\r\\n" ending becomes "\\n". A final line that has no newline is
   stripped as well and stays without a newline.
   '''
   for index, text in enumerate(lines):
     stripped = text.rstrip()
     # Only re-attach the newline if the line had one to begin with.
     lines[index] = stripped + '\n' if text.endswith('\n') else stripped


 def CheckLinksInContent(lines, fname):
   '''CheckLinksInContent verifies that reference style links are defined in the
   same document.

   Reference style links are links of the form [foo][], or [Foo][foo] where
   [foo] needs to be defined somewhere else in the document, at the start of a
   line, as:

       [foo]: https://example.com/foo

   Text between backticks is ignored when looking for links. Labels are
   compared exactly (case-sensitively).

   Parameters:
     lines : list of strings holding the document, one line per element. Not
             modified.
     fname : name of the document; only used in the error message.

   Raises:
     Exception : if one or more reference style links are not defined. The
                 message lists the unresolved labels in sorted order.
   '''

   definition_re = re.compile(r"\[(?P<ref>[^]]+)\]: ")
   defined = set()
   for line in lines:
     m = definition_re.match(line)
     if m is not None:
       defined.add(m.group('ref'))

   # Collapse all whitespace so that links wrapped across lines are seen as a
   # single span, then drop anything between backticks.
   text = re.sub(r'\s+', ' ', ''.join(lines))
   text = re.sub(r'`[^`]*`', '', text)

   unresolved = set()
   for m in re.finditer(r'\[(?P<a>[^]]*)\]\[(?P<ref>[^]]*)\]', text):
     # [foo][] refers to the label "foo"; [Foo][bar] refers to "bar".
     ref = m.group('ref') or m.group('a')
     if ref not in defined:
       unresolved.add(ref)

   if unresolved:
     # Sorted so that the message does not depend on set iteration order.
     raise Exception(
         'The following list of links were unresolved in {}:\n    {}\n'.format(
             fname, ','.join(sorted(unresolved))))


 def FormatMarkdown(fname, dry_run=False):
   '''FormatMarkdown resolves any includes, fixes links, corrects trailing
   whitespace, and verifies that reference style links are defined in Markdown
   document specified by the filename in |fname|.

   If |dry_run| is True, the file is not written; instead a unified diff of the
   changes that would be made is printed to standard output.

   A file containing the text "-- Skip validation --" on any line is left
   untouched and not checked.

   Returns True if the file at |fname| was modified (or in the case of
   |dry_run==True|, would be modified). False otherwise. A return value of False
   can be safely assumed to mean that the file contents were not modified.

   Exceptions raised by the individual steps (malformed or recursive includes,
   unresolved links, I/O errors) propagate to the caller.

   For additional information about the changes that are applied see the
   documentation in:

       ProcessIncludesInContent()
       FixOldStyleLinks()
       FixTrailingWhitespace()
       CheckLinksInContent()
   '''

   with open(fname, 'r') as f:
     lines = f.readlines()

   if any("-- Skip validation --" in l for l in lines):
     logging.info("Skipping %s due to skip validation directive", fname)
     return False

   # The fixers below mutate |lines| in place; keep a copy to compare against.
   unmodified = lines[:]

   ProcessIncludesInContent(lines, fname)
   FixOldStyleLinks(lines, fname)
   FixTrailingWhitespace(lines, fname)
   CheckLinksInContent(lines, fname)

   if lines == unmodified:
     logging.info("%s is already correctly formatted", fname)
     return False

   if dry_run:
     # print() does not interpolate its arguments the way logging does, so the
     # filename has to be formatted into the message explicitly.
     print("Would write %s with the following changes:" % fname)
     for l in difflib.unified_diff(unmodified, lines, fname + "  (original)",
                                   fname + "  (modified)"):
       print(l, end='')
     print('\n')
   else:
     logging.info("Writing %s", fname)
     with open(fname, 'w') as f:
       f.writelines(lines)

   return True
	#!/usr/bin/env python

	# Copyright 2017 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	from __future__ import print_function

	import difflib
	import logging
	import os
	import re
	import textwrap


	def ProcessIncludesInContent(lines, fname):
	  '''ProcessIncludesInContent expands any INCLUDE directives found in a markdown file.

	  Parameters:
	    lines : a list of strings with line endings that represent the contents
	            of the Markdown file. The list is modified in place; nothing is
	            returned.
	    fname : the name of the file including the path if applicable. Includes
	            are resolved relative to the directory of this path.

	  Include directives must start at the beginning of a line and take the form:

	      <!-- INCLUDE <relative_path> [(<line_count> lines)] [fenced as <fence_type>] -->

	  The parts of the directive in brackets are optional.

	  The <relative_path> is an unquoted string with no spaces in it that
	  specifies the relative path to the file to be included.

	  <fence_type> can be used to indicate that the content of the file should be
	  included as a fenced block of the specified type. Note that the type is
	  mandatory for the fence specification. If specified, the contents of the
	  file will be included in the file as follows:

	      ``` <fence_type>
	      <contents of the file>
	      ```

	  Once expanded, the "(<line_count> lines)" portion will be inserted to
	  indicate how many lines were added during the expansion. On a later run
	  that many lines following the directive are replaced by the fresh
	  expansion, so do not change the line count or the included contents.

	  Raises:
	    Exception : if a line starts with an INCLUDE marker but does not match
	                the directive syntax, or if an included file itself contains
	                an INCLUDE directive (recursive includes are not supported).
	    Errors raised by open() propagate if an included file cannot be read.
	  '''

	  marker = '<!-- INCLUDE '
	  # The optional parts are written as "(?:...|)": either the part followed
	  # by spaces, or nothing at all.
	  directive_re = re.compile(
	      r'<!-- INCLUDE +(?P<fn>[^ ]*) +(?:\((?P<lc>\d+) lines\) +|)'
	      r'(?:fenced as (?P<ft>\w+) +|)-->')
	  dirname = os.path.dirname(fname)

	  # Each entry is (index of directive, path, previous line count, fence type).
	  directives = []
	  for index, line in enumerate(lines):
	    if not line.startswith(marker):
	      continue
	    m = directive_re.match(line)
	    if m is None:
	      raise Exception("improperly formatted INCLUDE line: {}".format(line))
	    directives.append(
	        (index, m.group('fn'), int(m.group('lc') or 0), m.group('ft')))

	  # Expand bottom-up so that splicing one include does not shift the indices
	  # recorded for the directives above it.
	  for index, path, old_count, fence_type in reversed(directives):
	    with open(os.path.join(dirname, path), 'r') as f:
	      included = f.readlines()

	    if any(line.startswith(marker) for line in included):
	      raise Exception('''Recursive includes are not supported.''')

	    # A final line without a newline would merge with whatever follows it
	    # once written out. The recorded line count would then be one too high
	    # and the next run would swallow a line of the host document.
	    if included and not included[-1].endswith('\n'):
	      included[-1] += '\n'

	    if fence_type is not None:
	      included = ['``` {}\n'.format(fence_type)] + included + ['```\n']

	    lines[index] = '<!-- INCLUDE {} ({} lines){} -->\n'.format(
	        path, len(included),
	        ' fenced as {}'.format(fence_type) if fence_type else '')
	    # Replace the previous expansion (old_count lines, possibly zero).
	    lines[index + 1:index + 1 + old_count] = included


	def FixOldStyleLinks(lines, fname):
	  '''FixOldStyleLinks replaces links of the form [foo] with [foo][], in place.

	  The former style is accepted by Gitiles, but is not valid CommonMark. Hence
	  this function replaces it with the equivalent latter form. This replacement
	  ensures that the links are correctly handled by editors and viewers other
	  than Gitiles.

	  Parameters:
	    lines : list of strings holding the document, one line per element. The
	            list is modified in place; nothing is returned.
	    fname : not used by this function.

	  A bracketed span is left alone when it directly follows "]" or is directly
	  followed by "[", ":" or "(". Lines from one line containing "```" up to
	  (but not including) the next such line are skipped, as is any line
	  containing "[TOC]".
	  '''

	  # "(^|[^]])" and "(?=$|[^[:(])" are alternations: start of line or a
	  # character other than "]", and end of line or a character other than
	  # "[", ":" and "(".
	  bad_link_re = re.compile(r'(^|[^]])\[(?P<ref>[^]]+)\](?=$|[^[:(])')
	  in_pre = False
	  for i, l in enumerate(lines):
	    # Every line mentioning a fence marker flips the state, so the opening
	    # fence line is skipped while the closing fence line is processed.
	    if '```' in l:
	      in_pre = not in_pre

	    if in_pre:
	      continue

	    if "[TOC]" in l:
	      continue

	    lines[i] = bad_link_re.sub(r'\1[\g<ref>][]', l)


	def FixTrailingWhitespace(lines, fname):
	  '''FixTrailingWhitespace removes trailing whitespace from every line, in place.

	  Parameters:
	    lines : list of strings holding the document, one line per element. The
	            list is modified in place; nothing is returned.
	    fname : not used by this function.

	  A line that ends with a newline keeps exactly one "\\n" after stripping, so
	  a "\\r\\n" ending becomes "\\n". A final line that has no newline is
	  stripped as well and stays without a newline.
	  '''
	  for i, l in enumerate(lines):
	    stripped = l.rstrip()
	    # Only re-attach the newline if the line had one to begin with.
	    lines[i] = stripped + '\n' if l.endswith('\n') else stripped


	def CheckLinksInContent(lines, fname):
	  '''CheckLinksInContent verifies that reference style links are defined in the
	  same document.

	  Reference style links are links of the form [foo][], or [Foo][foo] where
	  [foo] needs to be defined somewhere else in the document, at the start of a
	  line, as:

	      [foo]: https://example.com/foo

	  Text between backticks is ignored when looking for links. Labels are
	  compared exactly (case-sensitively).

	  Parameters:
	    lines : list of strings holding the document, one line per element. Not
	            modified.
	    fname : name of the document; only used in the error message.

	  Raises:
	    Exception : if one or more reference style links are not defined. The
	                message lists the unresolved labels in sorted order.
	  '''

	  links = set()
	  link_re = re.compile(r"\[(?P<ref>[^]]+)\]: ")
	  for l in lines:
	    m = link_re.match(l)
	    if m is None:
	      continue
	    links.add(m.group('ref'))

	  # Collapse all whitespace so that links wrapped across lines are seen as a
	  # single span, then drop anything between backticks.
	  whole_thing = re.sub(r'\s+', ' ', ''.join(lines))
	  whole_thing = re.sub(r'`[^`]*`', '', whole_thing)

	  not_found = set()
	  # Both labels use "*" so that the empty second label in [foo][] matches.
	  for m in re.finditer(r'\[(?P<a>[^]]*)\]\[(?P<ref>[^]]*)\]', whole_thing):
	    # [foo][] refers to the label "foo"; [Foo][bar] refers to "bar".
	    ref = m.group('ref') if m.group('ref') != '' else m.group('a')
	    if ref not in links:
	      not_found.add(ref)

	  if not_found:
	    # Sorted so that the message does not depend on set iteration order.
	    raise Exception(
	        'The following list of links were unresolved in {}:\n    {}\n'.format(
	            fname, ','.join(sorted(not_found))))


	def FormatMarkdown(fname, dry_run=False):
	  '''FormatMarkdown resolves any includes, fixes links, corrects trailing
	  whitespace, and verifies that reference style links are defined in Markdown
	  document specified by the filename in |fname|.

	  If |dry_run| is True, the file is not written; instead a unified diff of the
	  changes that would be made is printed to standard output.

	  A file containing the text "-- Skip validation --" on any line is left
	  untouched and not checked.

	  Returns True if the file at |fname| was modified (or in the case of
	  |dry_run==True|, would be modified). False otherwise. A return value of False
	  can be safely assumed to mean that the file contents were not modified.

	  Exceptions raised by the individual steps (malformed or recursive includes,
	  unresolved links, I/O errors) propagate to the caller.

	  For additional information about the changes that are applied see the
	  documentation in:

	      ProcessIncludesInContent()
	      FixOldStyleLinks()
	      FixTrailingWhitespace()
	      CheckLinksInContent()
	  '''

	  with open(fname, 'r') as f:
	    lines = f.readlines()

	  for l in lines:
	    if "-- Skip validation --" in l:
	      logging.info("Skipping %s due to skip validation directive", fname)
	      return False

	  # The fixers below mutate |lines| in place; keep a copy to compare against.
	  unmodified = lines[:]

	  ProcessIncludesInContent(lines, fname)
	  FixOldStyleLinks(lines, fname)
	  FixTrailingWhitespace(lines, fname)
	  CheckLinksInContent(lines, fname)

	  if lines == unmodified:
	    logging.info("%s is already correctly formatted", fname)
	    return False

	  if dry_run:
	    # print() does not interpolate its arguments the way logging does, so the
	    # filename has to be formatted into the message explicitly.
	    print("Would write %s with the following changes:" % fname)
	    for l in difflib.unified_diff(unmodified, lines, fname + " (original)",
	                                  fname + " (modified)"):
	      print(l, end='')
	    print('\n')
	  else:
	    logging.info("Writing %s", fname)
	    with open(fname, 'w') as f:
	      f.writelines(lines)

	  return True