add real html2markdown.py code

commit: 8d7c9e4f009861c707468a3b8513e039e522005d [log] [tgz]
author: Dirk Pranke <dpranke@google.com> Thu Oct 14 00:32:32 2021
committer: Dirk Pranke <dpranke@google.com> Thu Oct 14 00:32:32 2021
tree: 301860a359b00e80e2a693daa02c793a7100f122
parent: 20d99f3155c20776eb26b9f5bb3eef205e19b178 [diff]
diff --git a/scripts/html2markdown.py b/scripts/html2markdown.py
index 367d30d..699b12a 100644
--- a/scripts/html2markdown.py
+++ b/scripts/html2markdown.py

@@ -14,8 +14,1176 @@
 
 """HTML to Markdown renderer."""
 
+import io
+import os
+import re
+import textwrap
+import urllib
+import urllib.parse
+import xml.sax
 
-def Convert(input_stream, output_stream, url_translator):
+
+class _Flags:
+  """Rendering options, read by the rest of the module through FLAGS."""
+
+  # Whether to render h1s and h2s with underlined - and =.
+  underline_headers = False
+
+  # The set of characters to escape with a backslash ('\') in the
+  # Markdown. This is not the set of all special Markdown
+  # characters, but rather those characters that tend to
+  # get misinterpreted as Markdown syntax the most. Blindly
+  # escaping all special Markdown characters results in ugly
+  # Markdown.
+  escape_chars = r'\`*[]'
+
+  # Format for italic tags.
+  italic_format = '*'
+
+  # Format for bold tags.
+  bold_format = '**'
+
+  # Format for strikethrough tags.
+  strike_format = '~~'
+
+  # Format for underline tags (used by HighlightFormattedText).
+  highlight_format = '=='
+
+  # Number of spaces to indent an unordered list.
+  # This total includes the bullet.
+  # For example, a value of 4 yields '*   '
+  unordered_list_indent = 4
+
+  # Number of spaces to indent an ordered list.
+  # This total includes the number.
+  # For example, a value of 4 yields '1.  '
+  ordered_list_indent = 4
+
+  # Regex matched against the class attribute of DIV blocks that should
+  # be formatted as code.
+  code_class_regex = r'^sites-codeblock sites-codesnippet-block$'
+
+  # Regex matched against the class attribute of DIV blocks used for a
+  # table of contents.
+  toc_class_regex = r'^sites-embed-content sites-embed-type-toc$'
+
+  # Regex matched against the class attribute of DIV blocks that should
+  # be ignored. Empty means "ignore nothing".
+  ignore_class_regex = r''
+
+  # Regex matched against the style attribute of DIV blocks that should
+  # be ignored.
+  ignore_style_regex = r'^display:none;$'
+
+  # Format text blocks to the given line width. Set to zero
+  # to disable line wrapping.
+  line_width = 80
+
+  # Whether to use indented code blocks, if False use fenced.
+  indented_code_blocks = False
+
+  # Whether to use HTML code blocks instead of fenced code
+  # blocks if source code block includes formatted text.
+  allow_html_code_blocks = True
+
+  # Links that are automatically recognized by the renderer.
+  shortlinks_regex = r'^http://(ag|b|cl|g|go|who)/'
+
+  # Print the fragment tree for debugging.
+  debug_print_tree = False
+
+
+# Single shared options instance consulted throughout this module.
+FLAGS = _Flags()
+
+
+def _EscapeText(text, reserved_chars):
+  """Prefixes every reserved character in the text with a backslash.
+
+  Args:
+    text: The string to escape.
+    reserved_chars: A string holding the characters that need escaping.
+
+  Returns:
+    A copy of text in which each reserved character is preceded by '\\'.
+  """
+  return ''.join(
+      '\\' + ch if ch in reserved_chars else ch for ch in text)
+
+
+def _EscapeContentForHtml(text):
+  """Replaces '<' and '>' in the text with their HTML entities.
+
+  Only the two angle brackets are rewritten; '&' is left as is.
+
+  Args:
+    text: The string to escape.
+
+  Returns:
+    An io.StringIO (not a str) holding the escaped text. Its position is at
+    the end of the written text, so later writes append.
+  """
+  escaped = text.replace('<', '&lt;').replace('>', '&gt;')
+  buf = io.StringIO()
+  buf.write(escaped)
+  return buf
+
+
+# Placeholder standing in for '\n' in content where a literal newline would
+# be removed by StripLine(); it is turned back into '\n' afterwards.
+ENCODED_NEWLINE = '&#{:d};'.format(ord('\n'))
+
+
+def _RestoreEncodedNewlines(text):
+  """Turns every ENCODED_NEWLINE placeholder in text back into a newline."""
+  return '\n'.join(text.split(ENCODED_NEWLINE))
+
+
+def _WrapLine(line, indent):
+  """Wraps a line so it fits FLAGS.line_width, indenting continuations.
+
+  Args:
+    line: The string to wrap.
+    indent: An integer with the number of columns of indentation. It is
+        subtracted from the available width and used to pad every line
+        after the first.
+
+  Returns:
+    The wrapped text, or the line unchanged when FLAGS.line_width is not
+    positive.
+  """
+  if FLAGS.line_width <= 0:
+    # Wrapping is disabled.
+    return line
+  continuation = '\n' + ' ' * indent
+  pieces = textwrap.wrap(
+      line,
+      width=FLAGS.line_width - indent,
+      break_long_words=False,
+      break_on_hyphens=False)
+  return continuation.join(pieces)
+
+
+class Fragment:
+  """Base class for all output fragments.
+
+  To generate a line of output, the methods will be called in the following
+  order:
+
+  WriteIndent()
+  WriteContentIntoParentAndClear()
+  ConsumeContent() -- for the topmost fragment only
+  StripLine()
+  WrapLine()
+  """
+
+  def __init__(self, indent, prefix, suffix):
+    """Creates an empty fragment.
+
+    Args:
+      indent: String written by WriteIndent(), or None/empty for no indent.
+      prefix: String written before non-blank content, or None/empty.
+      suffix: String written after non-blank content, or None/empty.
+    """
+    self._content = io.StringIO()
+    self._indent = indent
+    self._prefix = prefix
+    self._suffix = suffix
+    self._parent = None
+    self._children = []
+
+  def __repr__(self):
+    # All fields are already str under Python 3. Encoding them to bytes (a
+    # Python 2 leftover) made the concatenation below raise TypeError.
+    def debug_print(text):
+      return text or ''
+
+    return ('{' +
+            self.__class__.__name__ +
+            ': indent=' + debug_print(self._indent) +
+            '; prefix=' + debug_print(self._prefix) +
+            '; content=' + debug_print(self._content.getvalue()) +
+            '; suffix=' + debug_print(self._suffix) +
+            '}')
+
+  def SetParent(self, parent):
+    self._parent = parent
+
+  def AddChild(self, node):
+    """Appends node to the children, makes self its parent, returns node."""
+    self._children.append(node)
+    node.SetParent(self)
+    return node
+
+  def GetChildren(self):
+    return self._children
+
+  def _AllChildren(self):
+    """Returns all descendants (not self) in depth-first pre-order."""
+    all_children = []
+    def Traverse(fragment):
+      for c in fragment.GetChildren():
+        all_children.append(c)
+        Traverse(c)
+    Traverse(self)
+    return all_children
+
+  def WriteIndent(self, output):
+    """Writes the indent string to output, if there is one."""
+    if self._indent:
+      output.write(self._indent)
+
+  def WriteContentIntoParentAndClear(self):
+    """Renders this fragment into the parent's buffer, then empties it.
+
+    Both the content buffer and the list of children are reset. The fragment
+    must have a parent.
+    """
+    self._WriteContent(self._parent._content)  # pylint: disable=protected-access
+    self._ClearContent()
+    self._children = []
+
+  def _WriteContent(self, output):
+    """Implementation of content rendering. Can be overridden in subclasses."""
+    self._Write(output, self._prefix, self._content.getvalue(), self._suffix)
+
+  def _Write(self, output, prefix, content, suffix):
+    """Default implementation of content rendering for reuse by subclasses.
+
+    The content is always written; prefix and suffix are written only when
+    the content has at least one non-whitespace character.
+    """
+    has_content = bool(content.strip())
+    if prefix and has_content:
+      output.write(prefix)
+    output.write(content)
+    if suffix and has_content:
+      output.write(suffix)
+
+  def UnsetSuffix(self):
+    self._suffix = ''
+
+  def UnsetPrefix(self):
+    self._prefix = ''
+
+  def _UpdatePrefixAndSuffix(self, prefix, suffix):
+    """Replaces prefix/suffix, but only where one is currently set.
+
+    A prefix or suffix that was unset (e.g. by UnsetPrefix() when merging
+    adjacent formatting) stays unset.
+    """
+    if self._prefix:
+      self._prefix = prefix
+    if self._suffix:
+      self._suffix = suffix
+
+  def _ClearContent(self):
+    """Clears the content. This will only be called after it's been written."""
+    self._content = io.StringIO()
+
+  def ConsumeContent(self):
+    """Returns the current content buffer (an io.StringIO) and resets it."""
+    content = self._content
+    self._ClearContent()
+    return content
+
+  def Append(self, text):
+    """Appends text.
+
+    Args:
+      text: The string to append, it will be escaped.
+    """
+    assert isinstance(text, str)
+    self._content.write(self.EscapeText(text))
+
+  def EscapeText(self, text):
+    """Escapes any reserved characters when Append() is called with text.
+
+    By default this defers to the parent fragment.
+
+    Args:
+      text: The string to escape.
+
+    Returns:
+      The escaped string.
+    """
+    if self._parent:
+      return self._parent.EscapeText(text)
+    return text
+
+  def StripLine(self, text):
+    """Does any needed stripping of whitespace.
+
+    Some blocks (code for example) will want to preserve whitespace, while
+    others will want to coalesce it together. By default this defers to the
+    parent fragment.
+
+    Args:
+      text: The string to strip
+
+    Returns:
+      The stripped string.
+    """
+    if self._parent:
+      return self._parent.StripLine(text)
+    return text
+
+  def WrapLine(self, line, indent):
+    """Wraps the line to fit into the column limit, if necessary.
+
+    Most blocks (code for example) will want to preserve whitespace and won't
+    break their output, so the base implementation returns the line as is.
+
+    Args:
+      line: The string to wrap.
+      indent: Indent string.
+    Returns:
+      The wrapped string.
+    """
+    del indent
+    return line
+
+  def NeedsToMergeWith(self, text):
+    """Tells whether the fragment `text` must be merged with this one.
+
+    The base implementation never merges.
+    """
+    del text
+    return False
+
+
+class Text(Fragment):
+  """Plain-string Markdown fragment; every constructor argument is optional."""
+
+  def __init__(self, indent=None, prefix=None, suffix=None):
+    super().__init__(indent=indent, prefix=prefix, suffix=suffix)
+
+
+class IgnoreBlock(Fragment):
+  """Marker fragment for a region whose content is to be omitted.
+
+  It has no indent, prefix or suffix of its own.
+  """
+
+  def __init__(self):
+    super().__init__(indent=None, prefix=None, suffix=None)
+
+
+class TextBlock(Text):
+  """Text fragment that escapes reserved characters and collapses spaces."""
+
+  def EscapeText(self, text):
+    """Escapes the characters listed in FLAGS.escape_chars."""
+    return _EscapeText(text, FLAGS.escape_chars)
+
+  def StripLine(self, text):
+    """Returns text on one line with whitespace runs reduced to one space."""
+    # Newlines and Unicode non-breaking spaces (U+00A0) both become plain
+    # spaces. Replacing is safer than deleting, because runs of spaces are
+    # coalesced right below anyway.
+    flattened = text.replace('\n', ' ').replace(chr(160), ' ')
+    return re.sub(r' +', ' ', flattened.strip())
+
+
+class WrappedTextBlock(TextBlock):
+  """A TextBlock whose output lines are wrapped to the column limit."""
+
+  def WrapLine(self, line, indent):
+    """Wraps line, reserving len(indent) columns for the indent string."""
+    indent_width = len(indent)
+    return _WrapLine(line, indent_width)
+
+
+class BlockquoteBlock(WrappedTextBlock):
+  """Wraps content and prepends each line with the indent ('> ' by default).
+
+  Paragraphs inside a blockquote must be emitted by the generator as
+  BlockquoteBlocks without an indent. Their WrapLine() call is then passed
+  up the parent chain until it reaches a BlockquoteBlock that has an indent,
+  which does the wrapping and puts the indent in front of every line.
+  """
+
+  def __init__(self, indent='> '):
+    super().__init__(indent=indent, prefix=None, suffix=None)
+
+  def WrapLine(self, line, indent):
+    if self._parent and not self._indent:
+      # Indent-less inner block: let an enclosing block do the work.
+      return self._parent.WrapLine(line, indent)
+    # Line endings are kept by splitlines(True), so joining with the indent
+    # places it at the start of every line after the first.
+    pieces = _WrapLine(line, len(indent)).splitlines(True)
+    return indent.join([piece.lstrip() for piece in pieces])
+
+
+class CodeBlock(Text):
+  """Common base of the code block fragments; text is never escaped."""
+
+  def EscapeText(self, text):
+    """Returns text untouched: code is not Markdown-escaped."""
+    return text
+
+  def StripLine(self, text):
+    """Removes every newline from text."""
+    # Completely ignore newlines in code blocks. Sites always uses <br/>.
+    return ''.join(text.split('\n'))
+
+  def ChangeToHtml(self):
+    """HTML-escapes the buffered content in place, if there is any."""
+    current = self._content.getvalue()
+    if not current:
+      return
+    # _EscapeContentForHtml() hands back a fresh StringIO buffer.
+    self._content = _EscapeContentForHtml(current)
+
+
+class IndentedCodeBlock(CodeBlock):
+  """Code block rendered with an indent (four spaces by default)."""
+
+  def __init__(self, indent='    '):
+    super().__init__(indent=indent, prefix=None, suffix=None)
+
+
+class FencedCodeBlock(CodeBlock):
+  """A FencedCodeBlock is fenced with triple backticks (```).
+
+  To render correctly, content writing must not happen
+  unless the end of the source code block has been encountered.
+  That is, the entire code block from the source HTML must
+  be rendered in a single write pass.
+  """
+
+  def __init__(self, indent=None,
+               prefix='```none' + ENCODED_NEWLINE,
+               suffix=ENCODED_NEWLINE + '```'):
+    # The fences use ENCODED_NEWLINE rather than '\n' because
+    # CodeBlock.StripLine() deletes literal newlines; StripLine() below
+    # decodes the placeholders afterwards.
+    super().__init__(indent, prefix, suffix)
+
+  def WriteIndent(self, output):
+    """Switches the block to HTML if needed, then writes the indent.
+
+    When FLAGS.allow_html_code_blocks is set and any descendant is a
+    FormattedText, all FormattedText and CodeBlock descendants are changed to
+    HTML and the fences become <pre><code> ... </code></pre>.
+    """
+    # Adjust inner fragments and self before rendering.
+    if FLAGS.allow_html_code_blocks:
+      has_formatted_text = False
+      for c in self._AllChildren():
+        if isinstance(c, FormattedText):
+          c.ChangeToHtml()
+          has_formatted_text = True
+      if has_formatted_text:
+        # _AllChildren() does not include self, so only descendant code
+        # fragments are escaped here.
+        for c in self._AllChildren():
+          if isinstance(c, CodeBlock):
+            c.ChangeToHtml()
+        self._UpdatePrefixAndSuffix(
+            '<pre><code>', ENCODED_NEWLINE + '</code></pre>')
+    super().WriteIndent(output)
+
+  def StripLine(self, text):
+    """Drops literal newlines, decodes encoded ones, removes empty lines."""
+    text = super().StripLine(text)
+    lines = _RestoreEncodedNewlines(text).splitlines()
+    return '\n'.join([l for l in lines if l])
+
+  def WrapLine(self, line, indent):
+    """Never wraps; only inserts indent before every line after the first."""
+    lines = line.splitlines(True)
+    return indent.join(lines)
+
+
+class FencedCodeBlockLine(Text):
+  """One line of code inside a FencedCodeBlock.
+
+  By default the line is surrounded by encoded newlines.
+  """
+
+  def __init__(self, indent=None,
+               prefix=ENCODED_NEWLINE, suffix=ENCODED_NEWLINE):
+    super().__init__(indent=indent, prefix=prefix, suffix=suffix)
+
+  def StripLine(self, text):
+    """Strips via the inherited StripLine(), then decodes the newlines."""
+    return _RestoreEncodedNewlines(super().StripLine(text))
+
+
+class UnderlinedHeader(TextBlock):
+  """Section header rendered as a title line plus a line of `char`."""
+
+  def __init__(self, char):
+    super().__init__()
+    self._char = char
+
+  def _WriteContent(self, output):
+    raw = self._content.getvalue()
+    # The underline is as long as the title will be after stripping.
+    underline_len = len(self.StripLine(raw))
+    if underline_len <= 0:
+      return
+    # '\n' will be stripped, so use an encoded '\n' that we can later replace
+    # after the line is stripped.
+    underline = ENCODED_NEWLINE + self._char * underline_len
+    self._Write(output, None, raw, underline)
+
+  def StripLine(self, text):
+    return _RestoreEncodedNewlines(super().StripLine(text))
+
+
+class FormattedText(Text):
+  """Text wrapped in Markdown formatting.
+
+  The same format string is used as both prefix and suffix.
+  """
+
+  def __init__(self, fmt):
+    super().__init__(None, fmt, fmt)
+
+  def _Pad(self, bigger, smaller):
+    """Returns as many spaces as bigger is longer than smaller."""
+    return ' ' * (len(bigger) - len(smaller))
+
+  def _WriteContent(self, output):
+    """Writes the content with whitespace moved outside the markers.
+
+    Leading/trailing whitespace is stripped from the content and the same
+    number of spaces is emitted before the prefix / after the suffix.
+    """
+    prefix = self._prefix
+    content = self._content.getvalue()
+    suffix = self._suffix
+    if prefix:
+      # If there are whitespaces immediately after the prefix,
+      # they must be pushed out before the prefix.
+      lstripped = content.lstrip()
+      if len(content) > len(lstripped):
+        prefix = self._Pad(content, lstripped) + prefix
+        content = lstripped
+    if suffix:
+      # If there are whitespaces immediately before the suffix,
+      # they must be pushed out after the suffix.
+      rstripped = content.rstrip()
+      if len(content) > len(rstripped):
+        suffix = suffix + self._Pad(content, rstripped)
+        content = rstripped
+    self._Write(output, prefix, content, suffix)
+
+  def ChangeToHtml(self):
+    """HTML-escapes the buffered content in place, if there is any.
+
+    Subclasses extend this to also swap their Markdown markers for tags.
+    """
+    content = self._content.getvalue()
+    if content:
+      # Store the escaped buffer (as CodeBlock.ChangeToHtml does). The result
+      # used to be bound to a local only, so the escaping was lost.
+      self._content = _EscapeContentForHtml(content)
+
+
+class BoldFormattedText(FormattedText):
+  """Bold text: FLAGS.bold_format in Markdown, <b> in HTML."""
+
+  def __init__(self):
+    super().__init__(fmt=FLAGS.bold_format)
+
+  def NeedsToMergeWith(self, text):
+    """Adjacent bold fragments have to be merged into one."""
+    return isinstance(text, BoldFormattedText)
+
+  def ChangeToHtml(self):
+    super().ChangeToHtml()
+    open_tag, close_tag = '<b>', '</b>'
+    self._UpdatePrefixAndSuffix(open_tag, close_tag)
+
+
+class ItalicFormattedText(FormattedText):
+  """Italic text: FLAGS.italic_format in Markdown, <i> in HTML."""
+
+  def __init__(self):
+    super().__init__(fmt=FLAGS.italic_format)
+
+  def NeedsToMergeWith(self, text):
+    """Adjacent italic fragments have to be merged into one."""
+    return isinstance(text, ItalicFormattedText)
+
+  def ChangeToHtml(self):
+    super().ChangeToHtml()
+    open_tag, close_tag = '<i>', '</i>'
+    self._UpdatePrefixAndSuffix(open_tag, close_tag)
+
+
+class StrikeThroughFormattedText(FormattedText):
+  """Struck-through text: FLAGS.strike_format in Markdown, <s> in HTML."""
+
+  def __init__(self):
+    super().__init__(fmt=FLAGS.strike_format)
+
+  def NeedsToMergeWith(self, text):
+    """Adjacent strike-through fragments have to be merged into one."""
+    return isinstance(text, StrikeThroughFormattedText)
+
+  def ChangeToHtml(self):
+    super().ChangeToHtml()
+    open_tag, close_tag = '<s>', '</s>'
+    self._UpdatePrefixAndSuffix(open_tag, close_tag)
+
+
+class HighlightFormattedText(FormattedText):
+  """Highlighted text: FLAGS.highlight_format in Markdown, <u> in HTML."""
+
+  def __init__(self):
+    super().__init__(fmt=FLAGS.highlight_format)
+
+  def NeedsToMergeWith(self, text):
+    """Adjacent highlighted fragments have to be merged into one."""
+    return isinstance(text, HighlightFormattedText)
+
+  def ChangeToHtml(self):
+    super().ChangeToHtml()
+    open_tag, close_tag = '<u>', '</u>'
+    self._UpdatePrefixAndSuffix(open_tag, close_tag)
+
+
+class ListItem(Text):
+  """Item in a list.
+
+  The bullet is emitted once only: it is dropped as soon as the content is
+  cleared, so later lines of the same item get no bullet.
+  """
+
+  def __init__(self, bullet):
+    super().__init__()
+    self._bullet = bullet
+
+  def WriteIndent(self, output):
+    bullet = self._bullet
+    if bullet:
+      # TODO(dpranke): The original code relied on strings and bytes
+      # being interchangeable in Python2, so you could seek backwards
+      # from the current location with a relative offset. You can't
+      # do that in Python3, apparently.
+      #
+      # To get around this for the moment, instead of seeking backwards
+      # 4 characters, we embed 4 '\b' backspaces, and then have the client
+      # do a global search and replace of '    \b\b\b\b' with '' instead.
+      #
+      # This is awkward, so we should rework this so that this isn't needed.
+      #
+      # output.seek(-len(self._bullet), os.SEEK_CUR)
+      backspaces = '\b' * len(bullet)
+      output.write(backspaces)
+      output.write(bullet)
+    super().WriteIndent(output)
+
+  def _ClearContent(self):
+    # Forget the bullet so that it is not written a second time.
+    self._bullet = None
+    super()._ClearContent()
+
+  def WrapLine(self, line, indent):
+    indent_width = len(indent)
+    return _WrapLine(line, indent_width)
+
+
+class Link(Text):
+  """Markdown link, optionally turned into an image reference."""
+
+  def __init__(self, href):
+    """Creates a link to href; the link text is appended afterwards."""
+    super().__init__()
+    self._href = href
+    self._url_opener_prefix = ''
+    self._url_opener_suffix = ''
+
+  def MakeAnImage(self, width, height):
+    """Renders the link as an image ('!' in front of the link).
+
+    Args:
+      width: Image width; emitted as an attribute only if height is also set.
+      height: Image height; emitted only if width is also set.
+    """
+    self._url_opener_prefix = '!'
+    if width and height:
+      self._url_opener_suffix = (
+          '{{width="{}" height="{}"}}'.format(width, height))
+
+  def _IsShortLink(self, text):
+    """Tells whether the link can be written as the bare text.
+
+    That is the case when the href matches FLAGS.shortlinks_regex and the
+    text equals the href's host plus path.
+
+    Args:
+      text: The link text.
+
+    Returns:
+      True or False.
+    """
+    if not FLAGS.shortlinks_regex:
+      return False
+    if not re.match(FLAGS.shortlinks_regex, self._href):
+      return False
+    parsed_href = urllib.parse.urlsplit(self._href)
+    return parsed_href.netloc + parsed_href.path == text
+
+  def _WriteLink(self, output, text):
+    """Writes either the bare short-link text or '[text](href)'."""
+    # Image links always use the full syntax.
+    write_short_link = (not (self._url_opener_prefix or self._url_opener_suffix)
+                        and self._IsShortLink(text))
+    if write_short_link:
+      self._Write(output, None, text, None)
+    else:
+      self._Write(output,
+                  self._url_opener_prefix + '[',
+                  text,
+                  '](' + self._href + ')' + self._url_opener_suffix)
+
+  def _WriteContent(self, output):
+    text = self._content.getvalue()
+    if not text:
+      # A link without text produces no output at all.
+      return
+    if text.startswith(('http://', 'https://')):
+      # The text is itself a URL: write it as '<text>' (href is not used).
+      self._Write(output, '<', text, '>')
+    else:
+      self._WriteLink(output, text)
+
+
+class Image(Text):
+  """Image, rendered as '![alt](src)'.
+
+  The width and the height are stored but are not part of the output.
+  """
+
+  def __init__(self, src, alt, width, height):
+    super().__init__()
+    self._src = src
+    # A missing or empty alt text falls back to 'image'.
+    self._alt = alt if alt else 'image'
+    self._width = width
+    self._height = height
+
+  def _WriteContent(self, output):
+    body = self._alt + '](' + self._src
+    self._Write(output, '![', body, ')')
+
+
+class Code(Text):
+  """Inline code, delimited by backticks."""
+
+  def __init__(self):
+    super().__init__(None, '`', '`')
+
+  def EscapeText(self, text):
+    """Returns text untouched: inline code is not Markdown-escaped."""
+    return text
+
+  def _WriteContent(self, output):
+    content = self._content.getvalue()
+    if '`' not in content:
+      self._Write(output, self._prefix, content, self._suffix)
+      return
+    # If a backtick (`) is present inside inline code, the fragment
+    # must use double backticks.
+    # Since having content starting or ending with a backtick would emit
+    # triple backticks which designates a fenced code fragment, pad content
+    # to avoid this.
+    if content.startswith('`'):
+      content = ' ' + content
+    if content.endswith('`'):
+      content += ' '
+    self._Write(output, '``', content, '``')
+
+  def NeedsToMergeWith(self, text):
+    """Adjacent inline code fragments have to be merged into one."""
+    return isinstance(text, Code)
+
+
+class EmbeddedContent(Text):
+  """Embedded content (Docs, Drawings, Presentations, etc.) as an <iframe>."""
+
+  def __init__(self, href, width, height):
+    super().__init__()
+    self._href = href
+    self._width = width
+    self._height = height
+
+  def _WriteContent(self, output):
+    parts = urllib.parse.urlsplit(self._href)
+    if parts.scheme == 'http':
+      # Upgrade plain http to https, keeping the rest of the URL.
+      _, netloc, path, query, fragment = parts
+      parts = urllib.parse.SplitResult(
+          'https', netloc, path, query, fragment)
+    # The size attributes are emitted only when both values are given.
+    if self._width and self._height:
+      size = ' width="{}" height="{}"'.format(self._width, self._height)
+    else:
+      size = ''
+    # Note: 'allow="fullscreen"' is requested for all content for simplicity.
+    # g3doc server has dedicated logic to deal with these requests.
+    element = '<iframe src="{}"{} allow="fullscreen" />'.format(
+        urllib.parse.urlunsplit(parts), size)
+    self._Write(output, None, element, None)
+
+
+class ListInfo:
+  """Bookkeeping for one list that is currently open."""
+
+  def __init__(self, tag):
+    # The number of items in the list
+    self.item_count = 0
+    # The tag used to start the list
+    self.tag = tag
+
+
+class FragmentTree:
+  """Class for managing a tree of fragments.
+
+  There is a "scope" formed by nested fragments, e.g.
+  italic fragment inside bold fragment inside paragraph.
+  The scope is stored in the stack. For convenience,
+  the stack always has at least one element (the top fragment).
+
+  Fragments popped out from the scope may be re-added
+  back into the tree as children of the last fragment.
+  This allows "chaining" of structured content for future
+  processing. For example, if there were several bold
+  fragments inside a paragraph interleaved with fragments
+  of regular text, all these fragments will end up as
+  children of the paragraph fragment.
+
+  """
+
+  def __init__(self, top_fragment):
+    self._stack = [top_fragment]
+
+  def ActiveFragmentScopeDepth(self):
+    """Returns the number of stacked fragments, not counting the top one."""
+    return len(self._stack) - 1
+
+  def StartFragment(self, fragment):
+    """Pushes fragment on the scope stack, parented to the current top."""
+    fragment.SetParent(self._stack[-1])
+    self._stack.append(fragment)
+    return fragment
+
+  def EndFragment(self):
+    """Pops the innermost fragment off the scope stack and returns it."""
+    return self._stack.pop()
+
+  def AppendFragment(self, fragment):
+    """Adds fragment as a child of the innermost fragment; returns it."""
+    return self._stack[-1].AddChild(fragment)
+
+  def _ApplyRecursivelyToNode(self, node, scope_operation, operation,  # pylint: disable=missing-docstring
+                              debug_indent):
+    # Post-order: every child subtree is processed before the node itself.
+    # scope_operation is only passed along; it is not applied here.
+    if not debug_indent:
+      for child in node.GetChildren():
+        self._ApplyRecursivelyToNode(child, scope_operation, operation, None)
+    else:
+      debug_indent += '  c '
+      for child in node.GetChildren():
+        print(debug_indent + repr(child))
+        self._ApplyRecursivelyToNode(child, scope_operation, operation,
+                                     debug_indent)
+    operation(node)
+
+  def _ApplyRecursivelyToScope(self, nodes, scope_operation, operation,  # pylint: disable=missing-docstring
+                               debug_indent):
+    # `nodes` is consumed from its end, so the caller passes the scope
+    # reversed in order to visit the outermost fragment first.
+    node = nodes.pop()
+    scope_operation(node)
+    if debug_indent:
+      print(debug_indent + repr(node))
+    if nodes:
+      self._ApplyRecursivelyToScope(nodes, scope_operation, operation,
+                                    (debug_indent + '  s ' if debug_indent
+                                     else None))
+    self._ApplyRecursivelyToNode(node, scope_operation, operation,
+                                 debug_indent)
+
+  def ApplyToAllFragments(self, scope_operation, operation):
+    """Recursively applies operations to all fragments in the tree.
+
+    The omnipresent topmost fragment is excluded. The 'scope_operation'
+    is applied to every element in the fragment stack in pre-order.
+    The 'operation' is applied to all fragments in the tree in post-order.
+
+    Args:
+      scope_operation: The operation to apply to fragments in the scope stack.
+      operation: The operation to apply to all fragments in the tree.
+    """
+    self._ApplyRecursivelyToScope(list(reversed(self._stack[1:])),
+                                  scope_operation, operation,
+                                  '  ' if FLAGS.debug_print_tree else None)
+
+  def FindFirstFragmentFromEnd(self, predicate, steps_from_last=0):
+    """Finds the innermost stacked fragment that satisfies the predicate.
+
+    Args:
+      predicate: Callable taking a fragment and returning a truthy value for
+          a match.
+      steps_from_last: Number of innermost fragments to leave out of the
+          search.
+
+    Returns:
+      The matching fragment closest to the end of the stack, or None.
+    """
+    sub_stack = self._stack[:-steps_from_last if steps_from_last else None]
+    # Search from the end, as the method name says; the stack used to be
+    # scanned from its start, which returned the outermost match instead.
+    return next((node for node in reversed(sub_stack) if predicate(node)),
+                None)
+
+  def PeekFragmentFromStart(self, steps_from_first=0):
+    return self._stack[steps_from_first]
+
+  def PeekFragmentFromEnd(self, steps_from_last=0):
+    return self._stack[-(steps_from_last + 1)]
+
+  def PeekLastAppendedFragment(self):
+    """Returns the newest child of the innermost fragment, or None."""
+    return (self._stack[-1].GetChildren()[-1]
+            if self._stack[-1].GetChildren() else None)
+
+class MarkdownGenerator:
+  """Generates Markdown based on the series of HTML tags seen.
+
+  Each time an opening HTML tag is seen, the appropriate markdown fragment is
+  created and pushed onto a stack. Any text encountered is appended to the
+  fragment at the top of the stack. When a closing HTML tag is seen, the stack
+  is popped and the fragment removed is appended to the new top of the stack.
+
+  Markdown is buffered in the fragment stack until an entire line has been
+  formed, at which point _WriteFragmentsAsLine() is called to write it out. The
+  content buffered in the stack is cleared, but otherwise the stack remains
+  unmodified.
+  """
+
+  def __init__(self, out, url_translator):
+    """Sets up an empty fragment tree and compiles the class/style filters.
+
+    Args:
+      out: Destination of the generated Markdown; kept as self._out.
+      url_translator: Object whose Translate() method is applied to link and
+          image URLs.
+    """
+    def compile_or_match_nothing(pattern):
+      # An empty flag is replaced by 'a^', so that the regexp matches nothing
+      # (rather than being None).
+      return re.compile(pattern or 'a^')
+
+    self._out = out
+    self._url_translator = url_translator
+    self._fragment_tree = FragmentTree(Text())
+    self._list_info_stack = []
+    self._pending_newlines = 0
+    self._code_class_regex = compile_or_match_nothing(FLAGS.code_class_regex)
+    self._toc_class_regex = compile_or_match_nothing(FLAGS.toc_class_regex)
+    self._ignore_class_regex = compile_or_match_nothing(
+        FLAGS.ignore_class_regex)
+    self._ignore_style_regex = compile_or_match_nothing(
+        FLAGS.ignore_style_regex)
+
+  def _Push(self, fragment):
+    """Pushes a fragment onto the fragment stack (its parent is set there).
+
+    If the current top of the stack is an IgnoreBlock, a fresh IgnoreBlock is
+    pushed in place of the given fragment.
+
+    Args:
+      fragment: The Fragment object to push on the stack.
+    """
+    tree = self._fragment_tree
+    if isinstance(tree.PeekFragmentFromEnd(), IgnoreBlock):
+      # Everything nested inside ignored content is ignored as well.
+      fragment = IgnoreBlock()
+    else:
+      # Adjacent formatting has to be merged: **bold****bold** is not
+      # correct Markdown syntax, **boldbold** is. So drop the closing marker
+      # of the previous fragment and the opening marker of this one.
+      previous = tree.PeekLastAppendedFragment()
+      if previous and previous.NeedsToMergeWith(fragment):
+        previous.UnsetSuffix()
+        fragment.UnsetPrefix()
+
+    tree.StartFragment(fragment)
+
+  def _Pop(self):
+    """Pops the innermost fragment off the fragment stack.
+
+    While the scope is more than one level deep, the popped fragment is
+    appended as a child of the new top of the stack. Otherwise the buffered
+    fragments are written to the output first and the fragment is then
+    popped without being re-attached.
+    """
+    tree = self._fragment_tree
+    if tree.ActiveFragmentScopeDepth() <= 1:
+      self._WriteFragmentsAsLine(newlines=0)
+      tree.EndFragment()
+      return
+    tree.AppendFragment(tree.EndFragment())
+
+  def _IsWithinFragmentType(self, fragment_type, steps_from_last=0):
+    """Tells whether the fragment stack holds a fragment of the given type.
+
+    Args:
+      fragment_type: Class (or tuple of classes) to test with isinstance().
+      steps_from_last: Number of innermost stack entries to leave out.
+
+    Returns:
+      True if a matching fragment is on the stack, False otherwise.
+    """
+    def has_type(fragment):
+      return isinstance(fragment, fragment_type)
+
+    found = self._fragment_tree.FindFirstFragmentFromEnd(
+        has_type, steps_from_last)
+    return found is not None
+
+  def Break(self):
+    """Handles a line break.
+
+    Inside a fenced code block an encoded newline is buffered as a line
+    fragment without prefix/suffix; anywhere else the buffered fragments are
+    written out, followed by one newline.
+    """
+    if self._IsWithinFragmentType(FencedCodeBlock):
+      line = FencedCodeBlockLine(prefix='', suffix='')
+      self._Push(line)
+      line.Append(ENCODED_NEWLINE)
+      self._Pop()
+      return
+    self._WriteFragmentsAsLine(newlines=1)
+
+  def HorizontalRule(self):
+    """Emits a '---' horizontal rule as a paragraph of its own."""
+    # Horizontal rule must be preceded and followed by a blank line
+    self._AddVerticallyPaddedParagraph('---')
+
+  def StartDocument(self):
+    """Opens the document by pushing a WrappedTextBlock as outermost scope."""
+    self._Push(WrappedTextBlock())
+
+  def EndDocument(self):
+    """Closes the document by popping the fragment stack."""
+    self._Pop()
+
+  def StartParagraph(self):
+    """Writes out buffered content, asking for two newlines after it."""
+    self._WriteFragmentsAsLine(newlines=2)
+
+  def EndParagraph(self):
+    """Writes out the paragraph, asking for two newlines after it."""
+    self._WriteFragmentsAsLine(newlines=2)
+
+  def StartDiv(self, cls, style):
+    """Process opening of a div element.
+
+    Buffered content is written out first (unless inside a fenced code
+    block), then a fragment chosen from the element's attributes and the
+    enclosing fragments is pushed. The checks below are ordered by priority.
+
+    Args:
+      cls: The class attribute of the element.
+      style: The style attribute of the element.
+    """
+    # Inside a fenced code block nothing is written out at this point.
+    if not self._IsWithinFragmentType(FencedCodeBlock):
+      if self._IsWithinFragmentType(CodeBlock):
+        self._WriteFragmentsAsLine(newlines=1)
+      else:
+        self._WriteFragmentsAsLine(newlines=2)
+
+    if ((cls and self._ignore_class_regex.match(cls)) or
+        style and self._ignore_style_regex.match(style)):
+      self._Push(IgnoreBlock())
+    elif self._IsWithinFragmentType(FencedCodeBlock):
+      # A div nested in a fenced code block is one line of code.
+      self._Push(FencedCodeBlockLine())
+    elif self._IsWithinFragmentType(CodeBlock):
+      self._Push(CodeBlock())
+    elif self._IsWithinFragmentType(BlockquoteBlock):
+      # No indent of its own: the enclosing BlockquoteBlock adds the '> '.
+      self._Push(BlockquoteBlock(indent=None))
+    elif cls and self._toc_class_regex.match(cls):
+      self._AddTableOfContents()
+      self._Push(IgnoreBlock())  # Ignore the items inside the Sites TOC
+    elif cls and self._code_class_regex.match(cls):
+      if FLAGS.indented_code_blocks:
+        self._Push(IndentedCodeBlock())
+      else:
+        self._Push(FencedCodeBlock())
+    else:
+      self._Push(WrappedTextBlock())
+
+  def EndDiv(self):
+    """Process closing of a div element.
+
+    The enclosing fragments (the stack without its innermost entry) decide
+    what is written out: nothing inside a fenced code block, one newline
+    inside another code block, two newlines otherwise. Then the div's
+    fragment is popped.
+    """
+    if not self._IsWithinFragmentType(FencedCodeBlock, steps_from_last=1):
+      inside_code = self._IsWithinFragmentType(CodeBlock, steps_from_last=1)
+      self._WriteFragmentsAsLine(newlines=1 if inside_code else 2)
+    self._Pop()
+
+  def StartHeader(self, level):
+    """Process opening of a header element.
+
+    Args:
+      level: The header level. Levels 1 and 2 are underlined with '=' and
+          '-' when FLAGS.underline_headers is set; every other case gets a
+          prefix of `level` '#' characters.
+    """
+    self._WriteFragmentsAsLine(newlines=2)
+    underline_chars = {1: '=', 2: '-'}
+    if FLAGS.underline_headers and level in underline_chars:
+      self._Push(UnderlinedHeader(underline_chars[level]))
+    else:
+      self._Push(TextBlock(prefix=('#' * level) + ' '))
+
+  def EndHeader(self):
+    """Writes out the header (two newlines after it) and pops its fragment."""
+    self._WriteFragmentsAsLine(newlines=2)
+    self._Pop()
+
+  def StartList(self, tag):
+    """Process opening of a list element.
+
+    Args:
+      tag: The tag that starts the list; 'ol' selects the ordered list
+          indent, anything else the unordered one.
+    """
+    # An outermost list is set off by two newlines, a nested one by one.
+    self._WriteFragmentsAsLine(newlines=1 if self._list_info_stack else 2)
+    self._list_info_stack.append(ListInfo(tag))
+    if tag == 'ol':
+      width = FLAGS.ordered_list_indent
+    else:
+      width = FLAGS.unordered_list_indent
+    self._Push(Text(' ' * width))
+
+  def EndList(self):
+    """Process closing of a list element."""
+    self._list_info_stack.pop()
+    # Two newlines after an outermost list, one after a nested list.
+    self._WriteFragmentsAsLine(newlines=1 if self._list_info_stack else 2)
+    self._Pop()
+
+  def StartListItem(self):
+    """Process opening of a list item and push a ListItem with its bullet.
+
+    Items of an 'ol' list are numbered; all others, including items outside
+    of any list, get a '*' bullet.
+    """
+    self._WriteFragmentsAsLine(newlines=1)
+    # Google Sites sometimes spits out pages with <li> tags not enclosed within
+    # an <ol> or <ul> tag.
+    current = self._list_info_stack[-1] if self._list_info_stack else None
+    if current is not None:
+      current.item_count += 1
+    if current is not None and current.tag == 'ol':
+      # string.ljust makes room for as many digits as you need.
+      bullet = ('%d.' % current.item_count).ljust(FLAGS.ordered_list_indent)
+    else:
+      bullet = '*'.ljust(FLAGS.unordered_list_indent)
+    self._Push(ListItem(bullet))
+
+  def EndListItem(self):
+    """Writes out the item (one newline after it) and pops its fragment."""
+    self._WriteFragmentsAsLine(newlines=1)
+    self._Pop()
+
+  def StartFormat(self, tag):
+    """Pushes the fragment for an inline formatting tag.
+
+    Args:
+      tag: The HTML tag name. A tag without an entry in the table in effect
+          is handled by that table's None entry.
+    """
+    # Allowed formatting depends on the surrounding fragment type.
+    if self._IsWithinFragmentType(IndentedCodeBlock):
+      # Inside an indented code block no formatting is allowed.
+      formats_map = {None: CodeBlock}
+    else:
+      formats_map = {
+          'i': ItalicFormattedText,
+          'em': ItalicFormattedText,
+          'b': BoldFormattedText,
+          'strong': BoldFormattedText,
+          'strike': StrikeThroughFormattedText,
+          's': StrikeThroughFormattedText,
+          'del': StrikeThroughFormattedText,
+          'u': HighlightFormattedText,
+          'code': Code,
+          None: Text,
+      }
+      if self._IsWithinFragmentType(FencedCodeBlock):
+        if FLAGS.allow_html_code_blocks:
+          # HTML code block can render formats but must not use Code fragments.
+          formats_map['code'] = formats_map[None] = CodeBlock
+        else:
+          formats_map = {None: CodeBlock}
+    fragment_class = formats_map.get(tag, formats_map[None])
+    self._Push(fragment_class())
+
+  def EndFormat(self):
+    """Closes the fragment pushed by StartFormat()."""
+    self._Pop()
+
+  def StartAnchor(self, href):
+    """Process opening of an anchor element.
+
+    Args:
+      href: The link target, or None for an anchor without one. A target is
+          passed through the URL translator and pushed as a Link; an anchor
+          without a target is pushed as plain Text.
+    """
+    if href is None:
+      self._Push(Text())
+      return
+    translated = self._url_translator.Translate(href)
+    self._Push(Link(translated))
+
+  def EndAnchor(self):
+    """Closes the fragment pushed by StartAnchor()."""
+    self._Pop()
+
+  def StartBlockquote(self):
+    """Process opening of a blockquote element.
+
+    Inside a code block the quote is plain Text; otherwise the buffered
+    content is written out and a BlockquoteBlock is pushed.
+    """
+    if self._IsWithinFragmentType(CodeBlock):
+      self._Push(Text())
+      return
+    self._WriteFragmentsAsLine(newlines=1)
+    self._Push(BlockquoteBlock())
+
+  def EndBlockquote(self):
+    """Closes a <blockquote> element.
+
+    Outside of a CodeBlock the buffered content is flushed, requesting two
+    newlines after it. The current fragment is popped in either case.
+    """
+    inside_code = self._IsWithinFragmentType(CodeBlock)
+    if not inside_code:
+      self._WriteFragmentsAsLine(newlines=2)
+    self._Pop()
+
+  def Image(self, src, alt, width, height):
+    """Appends an Image fragment for an <img> element.
+
+    Args:
+      src: The image's src attribute, or None if the element has none.
+      alt: The image's alt attribute, or None.
+      width: The image's width attribute, or None.
+      height: The image's height attribute, or None.
+    """
+    # As in StartAnchor, only real URLs are handed to the translator. An <img>
+    # without a src attribute reaches this method as None, which translators
+    # other than the no-op default cannot be expected to handle.
+    if src is not None:
+      src = self._url_translator.Translate(src)
+    self._fragment_tree.AppendFragment(Image(src, alt, width, height))
+
+  def Iframe(self, src, width, height):
+    """Process an <iframe> element.
+
+    Sites use <iframe> for embedded content: Docs, Drawings, etc.
+    g3doc implements this by supporting <iframe> HTML tag directly.
+
+    This method is currently a no-op: the rendering code below is disabled
+    with `if False:`, so nothing is written for the element and the
+    arguments are unused.
+
+    Args:
+      src: Source URL.
+      width: Element width.
+      height: Element height.
+    """
+    # Deliberately unreachable; kept so the rendering can be re-enabled once
+    # the TODO below is resolved.
+    if False:
+      # TODO(dpranke): Figure out if we should support embedded IFRAME tags.
+      # For now, we skip over them.
+      self._WriteFragmentsAsLine(newlines=2)
+      self._Push(EmbeddedContent(src, width, height))
+      self._Pop()
+
+  def Text(self, text):
+    """Appends character data to the fragment tree.
+
+    The text is dropped when the last fragment is an IgnoreBlock. Otherwise
+    it is stored in a new CodeBlock fragment when within a CodeBlock, or in
+    a new Text fragment.
+
+    Args:
+      text: The character data to append.
+    """
+    if isinstance(self._fragment_tree.PeekFragmentFromEnd(), IgnoreBlock):
+      return
+    if self._IsWithinFragmentType(CodeBlock):
+      piece = CodeBlock()
+    else:
+      piece = Text()
+    # The fragment joins the tree before it receives its content.
+    self._fragment_tree.AppendFragment(piece)
+    piece.Append(text)
+
+  def _AddTableOfContents(self):
+    """Emits the literal '[TOC]' marker as its own paragraph."""
+    # TOC must be preceded and followed by a blank line
+    self._AddVerticallyPaddedParagraph('[TOC]')
+
+  def _AddVerticallyPaddedParagraph(self, text):
+    """Writes text as a separate paragraph.
+
+    Buffered content is flushed before and after the text, each time
+    requesting at least two newlines.
+
+    Args:
+      text: The paragraph content.
+    """
+    self._WriteFragmentsAsLine(newlines=2)
+    # Use CodeBlock to prevent escaping
+    verbatim = CodeBlock()
+    self._fragment_tree.AppendFragment(verbatim)
+    verbatim.Append(text)
+    self._WriteFragmentsAsLine(newlines=2)
+
+  def _WriteFragmentsAsLine(self, newlines):
+    """Flushes the content buffered in the fragment tree as one output line.
+
+    When there is content, the newlines pending from the previous line are
+    written first, followed by the indent and the content. When there is no
+    content, an existing non-zero pending newline count is raised to
+    `newlines` if that is larger.
+
+    Args:
+      newlines: The minimum number of newlines required in the output after this
+          line. These newlines won't be written out until the next line with
+          content is encountered.
+    """
+
+    # Collect the indent and the content, clearing the fragments' content.
+    indent_buffer = io.StringIO()
+    self._fragment_tree.ApplyToAllFragments(
+        lambda frag: frag.WriteIndent(indent_buffer),
+        lambda frag: frag.WriteContentIntoParentAndClear())
+    tail = self._fragment_tree.PeekFragmentFromEnd()
+    head = self._fragment_tree.PeekFragmentFromStart()
+    raw_line = head.ConsumeContent().getvalue()
+    # The last fragment decides how the line is stripped and wrapped.
+    stripped_line = tail.StripLine(raw_line)
+    indent = indent_buffer.getvalue()
+    line = tail.WrapLine(stripped_line, indent)
+
+    if line:
+      # Emit the newlines owed by the previous line before this one.
+      self._out.write('\n' * self._pending_newlines)
+      self._out.write(indent)
+      self._out.write(line)
+      self._pending_newlines = newlines
+    elif 0 < self._pending_newlines < newlines:
+      # Nothing to write: only raise an already pending newline count.
+      self._pending_newlines = newlines
+
+    if FLAGS.debug_print_tree:
+      # Separate trees printed during each writing session
+      print('-' * 20)
+
+
+class XhtmlHandler(xml.sax.ContentHandler):
+  """SAX content handler that forwards XHTML events to a MarkdownGenerator.
+
+  Element names are read as namespace-style tuples whose second item is the
+  local tag name. Tags without a matching branch are ignored.
+  """
+
+  # Matches a header tag name (h1 through h6) and captures its level.
+  _HEADER_TAG_RE = re.compile(r'h([1-6])$')
+
+  # Inline formatting tags, all routed to StartFormat/EndFormat.
+  _FORMAT_TAGS = ('b', 'code', 'em', 'i', 'strong', 's', 'strike', 'del', 'u')
+
+  # List container tags, routed to StartList/EndList.
+  _LIST_TAGS = ('ul', 'ol')
+
+  # Start tags whose generator method takes no arguments.
+  _SIMPLE_START_METHODS = {
+      'br': 'Break',
+      'hr': 'HorizontalRule',
+      'li': 'StartListItem',
+      'p': 'StartParagraph',
+      'blockquote': 'StartBlockquote',
+  }
+
+  # End tags whose generator method takes no arguments.
+  _SIMPLE_END_METHODS = {
+      'a': 'EndAnchor',
+      'li': 'EndListItem',
+      'div': 'EndDiv',
+      'p': 'EndParagraph',
+      'blockquote': 'EndBlockquote',
+  }
+
+  def __init__(self, out, url_translator):
+    """Creates the handler and the MarkdownGenerator it drives.
+
+    Args:
+      out: Passed to MarkdownGenerator as its output.
+      url_translator: Passed to MarkdownGenerator as its URL translator.
+    """
+    super().__init__()
+    self._generator = MarkdownGenerator(out, url_translator)
+
+  def startDocument(self):
+    """Forwards the start of the document to the generator."""
+    self._generator.StartDocument()
+
+  def endDocument(self):
+    """Forwards the end of the document to the generator."""
+    self._generator.EndDocument()
+
+  def startElementNS(self, name, qname, attrs):
+    """Dispatches an opening tag to the matching generator method.
+
+    Args:
+      name: Tuple whose second item is the local tag name.
+      qname: Unused.
+      attrs: Attribute mapping keyed by (None, attribute_name); missing
+          attributes are passed on as None.
+    """
+    tag = name[1]
+    generator = self._generator
+
+    def attribute(key):
+      return attrs.get((None, key))
+
+    simple_method = self._SIMPLE_START_METHODS.get(tag)
+    if simple_method is not None:
+      getattr(generator, simple_method)()
+    elif tag in self._FORMAT_TAGS:
+      generator.StartFormat(tag)
+    elif tag in self._LIST_TAGS:
+      generator.StartList(tag)
+    elif tag == 'a':
+      generator.StartAnchor(attribute('href'))
+    elif tag == 'div':
+      generator.StartDiv(attribute('class'), attribute('style'))
+    elif tag == 'img':
+      generator.Image(attribute('src'), attribute('alt'),
+                      attribute('width'), attribute('height'))
+    elif tag == 'iframe':
+      generator.Iframe(attribute('src'), attribute('width'),
+                       attribute('height'))
+    else:
+      header = self._HEADER_TAG_RE.match(tag)
+      if header:
+        generator.StartHeader(int(header.group(1)))
+
+  def endElementNS(self, name, qname):
+    """Dispatches a closing tag to the matching generator method.
+
+    Args:
+      name: Tuple whose second item is the local tag name.
+      qname: Unused.
+    """
+    tag = name[1]
+    generator = self._generator
+    simple_method = self._SIMPLE_END_METHODS.get(tag)
+    if simple_method is not None:
+      getattr(generator, simple_method)()
+    elif tag in self._FORMAT_TAGS:
+      generator.EndFormat()
+    elif tag in self._LIST_TAGS:
+      generator.EndList()
+    elif self._HEADER_TAG_RE.match(tag):
+      generator.EndHeader()
+
+  def characters(self, content):
+    """Forwards character data to the generator as text."""
+    self._generator.Text(content)
+
+
+class DefaultUrlTranslator:
+  """No-op UrlTranslator."""
+
+  def Translate(self, href):
+    """Returns href unchanged.
+
+    Args:
+      href: The URL found in the page.
+
+    Returns:
+      The same href object that was passed in.
+    """
+    return href
+
+
+def Convert(input_stream, output_stream, url_translator=DefaultUrlTranslator()):
   """Converts an input stream of xhtml into an output stream of markdown.
 
   Args:
@@ -23,6 +1191,7 @@
      output_stream: filehandle for the Markdown output.
      url_translator: Callback for translating URLs embedded in the page.
   """
-  # TODO(dpranke): replace this with the real conversion routine.
-  del url_translator
-  output_stream.write(input_stream.read())
+  parser = xml.sax.make_parser()
+  parser.setContentHandler(XhtmlHandler(output_stream, url_translator))
+  parser.setFeature(xml.sax.handler.feature_namespaces, 1)
+  parser.parse(input_stream)
commit	8d7c9e4f009861c707468a3b8513e039e522005d	[log] [tgz]
author	Dirk Pranke <dpranke@google.com>	Thu Oct 14 00:32:32 2021
committer	Dirk Pranke <dpranke@google.com>	Thu Oct 14 00:32:32 2021
tree	301860a359b00e80e2a693daa02c793a7100f122
parent	20d99f3155c20776eb26b9f5bb3eef205e19b178 [diff]