add real html2markdown.py code
diff --git a/scripts/html2markdown.py b/scripts/html2markdown.py
index 367d30d..699b12a 100644
--- a/scripts/html2markdown.py
+++ b/scripts/html2markdown.py
@@ -14,8 +14,1176 @@
"""HTML to Markdown renderer."""
+import io
+import os
+import re
+import textwrap
+import urllib
+import urllib.parse
+import xml.sax
-def Convert(input_stream, output_stream, url_translator):
+
+class _Flags:
+  """Configuration values read by the renderer through the FLAGS instance."""
+
+  # Whether to render h1s and h2s with underlined - and =.
+  underline_headers = False
+
+  # The set of characters to escape with a backslash in the
+  # Markdown. This is not the set of all special Markdown
+  # characters, but rather those characters that tend to
+  # get misinterpreted as Markdown syntax the most. Blindly
+  # escaping all special Markdown characters results in ugly
+  # Markdown.
+  escape_chars = r'\`*[]'
+
+  # Format for italic tags.
+  italic_format = '*'
+
+  # Format for bold tags.
+  bold_format = '**'
+
+  # Format for strikethrough tags.
+  strike_format = '~~'
+
+  # Format for underline (<u>) tags, which are rendered as highlighted text.
+  highlight_format = '=='
+
+  # Number of spaces to indent an unordered list.
+  # This total includes the bullet.
+  # For example, a value of 4 yields '*' followed by three spaces.
+  unordered_list_indent = 4
+
+  # Number of spaces to indent an ordered list.
+  # This total includes the number.
+  # For example, a value of 4 yields '1.' followed by two spaces.
+  ordered_list_indent = 4
+
+  # The DIV blocks that should be formatted as code.
+  code_class_regex = r'^sites-codeblock sites-codesnippet-block$'
+
+  # The class of DIV blocks used for table of contents.
+  toc_class_regex = r'^sites-embed-content sites-embed-type-toc$'
+
+  # The class of DIV blocks that should be ignored.
+  # An empty pattern is replaced by a never-matching one in MarkdownGenerator.
+  ignore_class_regex = r''
+
+  # The style of DIV blocks that should be ignored.
+  ignore_style_regex = r'^display:none;$'
+
+  # Format text blocks to the given line width. Set to zero
+  # to disable line wrapping.
+  line_width = 80
+
+  # Whether to use indented code blocks, if False use fenced.
+  indented_code_blocks = False
+
+  # Whether to use HTML code blocks instead of fenced code
+  # blocks if source code block includes formatted text.
+  allow_html_code_blocks = True
+
+  # Links that are automatically recognized by the renderer.
+  shortlinks_regex = r'^http://(ag|b|cl|g|go|who)/'
+
+  # Print the fragment tree for debugging.
+  debug_print_tree = False
+
+
+# Shared settings instance consulted by the fragment classes and the generator.
+FLAGS = _Flags()
+
+
+def _EscapeText(text, reserved_chars):
+  """Backslash-escapes every reserved character found in the text.
+
+  Args:
+    text: The string to escape.
+    reserved_chars: A string holding the characters that need a backslash.
+
+  Returns:
+    The text with a backslash inserted before each reserved character.
+  """
+  pieces = ['\\' + ch if ch in reserved_chars else ch for ch in text]
+  return ''.join(pieces)
+
+
+def _EscapeContentForHtml(text):
+  """Escapes angle brackets so the text can be embedded in HTML markup.
+
+  Args:
+    text: The string to escape.
+
+  Returns:
+    An io.StringIO buffer holding the escaped text. A buffer (not a string)
+    is returned because CodeBlock.ChangeToHtml() installs the result as the
+    fragment's content buffer.
+  """
+  # Each bracket must map to its character entity; mapping '<' and '>' to
+  # themselves would leave the text unescaped.
+  escapes = {'<': '&lt;', '>': '&gt;'}
+  result = io.StringIO()
+  result.write(''.join(escapes.get(c, c) for c in text))
+  return result
+
+
+# Placeholder for a newline that has to survive the StripLine() methods, which
+# replace or remove literal '\n'; _RestoreEncodedNewlines() turns it back.
+ENCODED_NEWLINE = '&#%d;' % ord('\n')
+
+
+def _RestoreEncodedNewlines(text):
+  """Returns text with every ENCODED_NEWLINE marker turned into a newline."""
+  segments = text.split(ENCODED_NEWLINE)
+  return '\n'.join(segments)
+
+
+def _WrapLine(line, indent):
+  """Breaks the line so that it fits into FLAGS.line_width columns.
+
+  Args:
+    line: The string to wrap.
+    indent: An integer with the number of columns of indentation.
+
+  Returns:
+    The wrapped text, with continuation lines indented by `indent` spaces,
+    or the line unchanged when FLAGS.line_width is not positive.
+  """
+  if FLAGS.line_width <= 0:
+    return line
+  pieces = textwrap.wrap(
+      line,
+      width=FLAGS.line_width - indent,
+      break_long_words=False,
+      break_on_hyphens=False)
+  separator = '\n' + ' ' * indent
+  return separator.join(pieces)
+
+
+class Fragment:
+  """Base class for all output fragments.
+
+  To generate a line of output, the methods will be called in the following
+  order:
+
+    WriteIndent()
+    WriteContentIntoParentAndClear()
+    ConsumeContent() -- for the topmost fragment only
+    StripLine()
+    WrapLine()
+  """
+
+  def __init__(self, indent, prefix, suffix):
+    """Initializes an empty fragment.
+
+    Args:
+      indent: String written by WriteIndent(), or None for no indent.
+      prefix: String written before non-blank content, or None.
+      suffix: String written after non-blank content, or None.
+    """
+    self._content = io.StringIO()
+    self._indent = indent
+    self._prefix = prefix
+    self._suffix = suffix
+    self._parent = None
+    self._children = []
+
+  def __repr__(self):
+    # Every piece stays a str (None and empty values are shown as ''):
+    # encoding the pieces to bytes would make building the result raise
+    # TypeError.
+    return '{{{}: indent={}; prefix={}; content={}; suffix={}}}'.format(
+        self.__class__.__name__,
+        self._indent or '',
+        self._prefix or '',
+        self._content.getvalue() or '',
+        self._suffix or '')
+
+  def SetParent(self, parent):
+    self._parent = parent
+
+  def AddChild(self, node):
+    """Appends node to the children, sets its parent and returns it."""
+    self._children.append(node)
+    node.SetParent(self)
+    return node
+
+  def GetChildren(self):
+    return self._children
+
+  def _AllChildren(self):
+    """Returns all descendants, each parent listed before its children."""
+    all_children = []
+    def Traverse(fragment):
+      for c in fragment.GetChildren():
+        all_children.append(c)
+        Traverse(c)
+    Traverse(self)
+    return all_children
+
+  def WriteIndent(self, output):
+    if self._indent:
+      output.write(self._indent)
+
+  def WriteContentIntoParentAndClear(self):
+    """Renders this fragment into the parent's buffer, then empties it."""
+    self._WriteContent(self._parent._content)  # pylint: disable=protected-access
+    self._ClearContent()
+    self._children = []
+
+  def _WriteContent(self, output):
+    """Implementation of content rendering. Can be overridden in subclasses."""
+    self._Write(output, self._prefix, self._content.getvalue(), self._suffix)
+
+  def _Write(self, output, prefix, content, suffix):
+    """Default implementation of content rendering for reuse by subclasses."""
+    # Prefix and suffix are only emitted around non-blank content; the
+    # content itself is always written, even when it is just whitespace.
+    has_content = bool(content.strip())
+    if prefix and has_content:
+      output.write(prefix)
+    output.write(content)
+    if suffix and has_content:
+      output.write(suffix)
+
+  def UnsetSuffix(self):
+    self._suffix = ''
+
+  def UnsetPrefix(self):
+    self._prefix = ''
+
+  def _UpdatePrefixAndSuffix(self, prefix, suffix):
+    # A prefix or suffix that is unset (for example by UnsetPrefix()) stays
+    # unset; only the ones still in effect are replaced.
+    if self._prefix:
+      self._prefix = prefix
+    if self._suffix:
+      self._suffix = suffix
+
+  def _ClearContent(self):
+    """Clears the content. This will only be called after it's been written."""
+    self._content = io.StringIO()
+
+  def ConsumeContent(self):
+    """Returns the current content buffer and starts a fresh one."""
+    content = self._content
+    self._ClearContent()
+    return content
+
+  def Append(self, text):
+    """Appends text.
+
+    Args:
+      text: The string to append, it will be escaped.
+    """
+    assert isinstance(text, str)
+    self._content.write(self.EscapeText(text))
+
+  def EscapeText(self, text):
+    """Escapes any reserved characters when Append() is called with text.
+
+    By default this defers to the parent fragment.
+
+    Args:
+      text: The string to escape.
+
+    Returns:
+      The escaped string.
+    """
+    if self._parent:
+      return self._parent.EscapeText(text)
+    return text
+
+  def StripLine(self, text):
+    """Does any needed stripping of whitespace.
+
+    Some blocks (code for example) will want to preserve whitespace, while
+    others will want to coalesce it together. By default this defers to the
+    parent fragment.
+
+    Args:
+      text: The string to strip
+
+    Returns:
+      The stripped string.
+    """
+    if self._parent:
+      return self._parent.StripLine(text)
+    return text
+
+  def WrapLine(self, line, indent):
+    """Wraps the line to fit into the column limit, if necessary.
+
+    Most blocks (code for example) will want to preserve whitespace and won't
+    break their output.
+
+    Args:
+      line: The string to wrap.
+      indent: Indent string.
+    Returns:
+      The wrapped string.
+    """
+    del indent
+    return line
+
+  def NeedsToMergeWith(self, text):
+    """Returns whether the given fragment must be merged with this one."""
+    del text
+    return False
+
+
+class Text(Fragment):
+  """A plain string fragment; indent, prefix and suffix are all optional."""
+
+  def __init__(self, indent=None, prefix=None, suffix=None):
+    super().__init__(indent=indent, prefix=prefix, suffix=suffix)
+
+
+class IgnoreBlock(Fragment):
+  """Markdown fragment standing in for content that is to be omitted."""
+
+  def __init__(self):
+    super().__init__(indent=None, prefix=None, suffix=None)
+
+
+class TextBlock(Text):
+  """Text fragment that escapes reserved characters and coalesces spaces."""
+
+  def EscapeText(self, text):
+    reserved = FLAGS.escape_chars
+    return _EscapeText(text, reserved)
+
+  def StripLine(self, text):
+    # Newlines and Unicode nonbreaking spaces (code point 160) both become
+    # plain spaces. Replacing is safer than deleting because runs of spaces
+    # are coalesced right below anyway.
+    for whitespace in ('\n', chr(160)):
+      text = text.replace(whitespace, ' ')
+    trimmed = text.strip()
+    return re.sub(r' +', ' ', trimmed)
+
+
+class WrappedTextBlock(TextBlock):
+  """TextBlock whose output lines are wrapped to fit the column limit."""
+
+  def WrapLine(self, line, indent):
+    indent_width = len(indent)
+    return _WrapLine(line, indent_width)
+
+
+class BlockquoteBlock(WrappedTextBlock):
+  """Wrapped text block that puts the quote indent in front of every line.
+
+  The generator must emit BlockquoteBlocks with no indent for paragraphs
+  inside a blockquote. The final WrapLine call then travels up to the
+  outermost BlockquoteBlock, which wraps the lines and prepends the indent
+  to each of them.
+  """
+
+  def __init__(self, indent='> '):
+    super().__init__(indent=indent, prefix=None, suffix=None)
+
+  def WrapLine(self, line, indent):
+    # A block without its own indent hands the work to its parent.
+    delegate = not self._indent and self._parent
+    if delegate:
+      return self._parent.WrapLine(line, indent)
+    pieces = _WrapLine(line, len(indent)).splitlines(True)
+    return indent.join(piece.lstrip() for piece in pieces)
+
+
+class CodeBlock(Text):
+  """Common base of the code block fragment implementations."""
+
+  def EscapeText(self, text):
+    # Code is passed through as is; nothing gets escaped.
+    return text
+
+  def StripLine(self, text):
+    # Completely ignore newlines in code blocks. Sites always uses <br/>.
+    return ''.join(text.split('\n'))
+
+  def ChangeToHtml(self):
+    """Replaces non-empty content with its HTML-escaped form."""
+    current = self._content.getvalue()
+    if not current:
+      return
+    self._content = _EscapeContentForHtml(current)
+
+
+class IndentedCodeBlock(CodeBlock):
+  """A code block whose lines are indented by four spaces."""
+
+  # The default indent is four spaces, as the class contract states; Markdown
+  # only treats a line as indented code when it is indented that far.
+  def __init__(self, indent='    '):
+    super().__init__(indent, None, None)
+
+
+class FencedCodeBlock(CodeBlock):
+  """A code block delimited by triple-backtick (```) fences.
+
+  To render correctly, content writing must not happen
+  unless the end of the source code block has been encountered.
+  That is, the entire code block from the source HTML must
+  be rendered in a single write pass.
+  """
+
+  def __init__(self, indent=None,
+               prefix='```none' + ENCODED_NEWLINE,
+               suffix=ENCODED_NEWLINE + '```'):
+    super().__init__(indent=indent, prefix=prefix, suffix=suffix)
+
+  def WriteIndent(self, output):
+    # Adjust inner fragments and self before rendering.
+    if FLAGS.allow_html_code_blocks:
+      descendants = self._AllChildren()
+      formatted = [c for c in descendants if isinstance(c, FormattedText)]
+      for fragment in formatted:
+        fragment.ChangeToHtml()
+      if formatted:
+        # With formatted text inside, the whole block is rendered as an
+        # HTML code block, so nested code content is switched over as well.
+        for fragment in descendants:
+          if isinstance(fragment, CodeBlock):
+            fragment.ChangeToHtml()
+        self._UpdatePrefixAndSuffix(
+            '<pre><code>', ENCODED_NEWLINE + '</code></pre>')
+    super().WriteIndent(output)
+
+  def StripLine(self, text):
+    restored = _RestoreEncodedNewlines(super().StripLine(text))
+    # Empty lines are dropped from the block.
+    return '\n'.join(l for l in restored.splitlines() if l)
+
+  def WrapLine(self, line, indent):
+    # No wrapping: each line after the first just gets the indent prepended.
+    return indent.join(line.splitlines(True))
+
+
+class FencedCodeBlockLine(Text):
+  """One line of code within a FencedCodeBlock."""
+
+  def __init__(self, indent=None,
+               prefix=ENCODED_NEWLINE, suffix=ENCODED_NEWLINE):
+    super().__init__(indent=indent, prefix=prefix, suffix=suffix)
+
+  def StripLine(self, text):
+    stripped = super().StripLine(text)
+    return _RestoreEncodedNewlines(stripped)
+
+
+class UnderlinedHeader(TextBlock):
+  """Section header rendered as a title line with an underline below it."""
+
+  def __init__(self, char):
+    super().__init__()
+    self._char = char
+
+  def _WriteContent(self, output):
+    raw = self._content.getvalue()
+    width = len(self.StripLine(raw))
+    if width <= 0:
+      return
+    # '\n' will be stripped, so use an encoded '\n' that we can later replace
+    # after the line is stripped.
+    underline = ENCODED_NEWLINE + self._char * width
+    self._Write(output, None, raw, underline)
+
+  def StripLine(self, text):
+    stripped = super().StripLine(text)
+    return _RestoreEncodedNewlines(stripped)
+
+
+class FormattedText(Text):
+  """Text wrapped in Markdown formatting."""
+
+  def __init__(self, fmt):
+    """Initializes the fragment with fmt as both prefix and suffix."""
+    super().__init__(None, fmt, fmt)
+
+  def _Pad(self, bigger, smaller):
+    """Returns as many spaces as bigger is longer than smaller."""
+    return ' ' * (len(bigger) - len(smaller))
+
+  def _WriteContent(self, output):
+    prefix = self._prefix
+    content = self._content.getvalue()
+    suffix = self._suffix
+    if prefix:
+      # If there are whitespaces immediately after the prefix,
+      # they must be pushed out before the prefix.
+      lstripped = content.lstrip()
+      if len(content) > len(lstripped):
+        prefix = self._Pad(content, lstripped) + prefix
+        content = lstripped
+    if suffix:
+      # If there are whitespaces immediately before the suffix,
+      # they must be pushed out after the suffix.
+      rstripped = content.rstrip()
+      if len(content) > len(rstripped):
+        suffix = suffix + self._Pad(content, rstripped)
+        content = rstripped
+    self._Write(output, prefix, content, suffix)
+
+  def ChangeToHtml(self):
+    """Replaces non-empty content with its HTML-escaped form."""
+    content = self._content.getvalue()
+    if content:
+      # Install the escaped buffer as the new content, the same way
+      # CodeBlock.ChangeToHtml() does; binding it to a local only would
+      # throw the escaped text away.
+      self._content = _EscapeContentForHtml(content)
+
+
+class BoldFormattedText(FormattedText):
+  """Bold text: FLAGS.bold_format in Markdown, <b> once changed to HTML."""
+
+  def __init__(self):
+    super().__init__(fmt=FLAGS.bold_format)
+
+  def NeedsToMergeWith(self, text):
+    # Only another bold fragment is merged with this one.
+    return isinstance(text, BoldFormattedText)
+
+  def ChangeToHtml(self):
+    super().ChangeToHtml()
+    opening, closing = '<b>', '</b>'
+    self._UpdatePrefixAndSuffix(opening, closing)
+
+
+class ItalicFormattedText(FormattedText):
+  """Italic text: FLAGS.italic_format in Markdown, <i> once changed to HTML."""
+
+  def __init__(self):
+    super().__init__(fmt=FLAGS.italic_format)
+
+  def NeedsToMergeWith(self, text):
+    # Only another italic fragment is merged with this one.
+    return isinstance(text, ItalicFormattedText)
+
+  def ChangeToHtml(self):
+    super().ChangeToHtml()
+    opening, closing = '<i>', '</i>'
+    self._UpdatePrefixAndSuffix(opening, closing)
+
+
+class StrikeThroughFormattedText(FormattedText):
+  """Struck-through text: FLAGS.strike_format in Markdown, <s> in HTML."""
+
+  def __init__(self):
+    super().__init__(fmt=FLAGS.strike_format)
+
+  def NeedsToMergeWith(self, text):
+    # Only another strike-through fragment is merged with this one.
+    return isinstance(text, StrikeThroughFormattedText)
+
+  def ChangeToHtml(self):
+    super().ChangeToHtml()
+    opening, closing = '<s>', '</s>'
+    self._UpdatePrefixAndSuffix(opening, closing)
+
+
+class HighlightFormattedText(FormattedText):
+  """Highlighted text: FLAGS.highlight_format in Markdown, <u> in HTML."""
+
+  def __init__(self):
+    super().__init__(fmt=FLAGS.highlight_format)
+
+  def NeedsToMergeWith(self, text):
+    # Only another highlight fragment is merged with this one.
+    return isinstance(text, HighlightFormattedText)
+
+  def ChangeToHtml(self):
+    super().ChangeToHtml()
+    opening, closing = '<u>', '</u>'
+    self._UpdatePrefixAndSuffix(opening, closing)
+
+
+class ListItem(Text):
+  """A single list entry that carries its bullet or number."""
+
+  def __init__(self, bullet):
+    super().__init__()
+    self._bullet = bullet
+
+  def WriteIndent(self, output):
+    bullet = self._bullet
+    if bullet:
+      # TODO(dpranke): The original code relied on strings and bytes
+      # being interchangeable in Python2, so you could seek backwards
+      # from the current location with a relative offset. You can't
+      # do that in Python3, apparently.
+      #
+      # To get around this for the moment, instead of seeking backwards
+      # 4 characters, we embed 4 '\b' backspaces, and then have the client
+      # do a global search and replace of ' \b\b\b\b' with '' instead.
+      #
+      # This is awkward, so we should rework this so that this isn't needed.
+      #
+      # output.seek(-len(self._bullet), os.SEEK_CUR)
+      output.write('\b' * len(bullet) + bullet)
+    super().WriteIndent(output)
+
+  def _ClearContent(self):
+    # Forget the bullet once content has been written, so that it is not
+    # emitted again by a later WriteIndent() call.
+    self._bullet = None
+    super()._ClearContent()
+
+  def WrapLine(self, line, indent):
+    indent_width = len(indent)
+    return _WrapLine(line, indent_width)
+
+
+class Link(Text):
+  """Markdown link, optionally rendered as an image reference."""
+
+  def __init__(self, href):
+    super().__init__()
+    self._href = href
+    self._url_opener_prefix = ''
+    self._url_opener_suffix = ''
+
+  def MakeAnImage(self, width, height):
+    """Turns the link into an image; adds size attributes if both are set."""
+    self._url_opener_prefix = '!'
+    if not (width and height):
+      return
+    self._url_opener_suffix = (
+        '{{width="{}" height="{}"}}'.format(width, height))
+
+  def _IsShortLink(self, text):
+    """Returns True if href is a short link spelled out by text, else None."""
+    pattern = FLAGS.shortlinks_regex
+    if not pattern or not re.match(pattern, self._href):
+      return None
+    parts = urllib.parse.urlsplit(self._href)
+    return True if parts.netloc + parts.path == text else None
+
+  def _WriteLink(self, output, text):
+    decorated = bool(self._url_opener_prefix or self._url_opener_suffix)
+    if not decorated and self._IsShortLink(text):
+      # A recognized short link is written as bare text.
+      self._Write(output, None, text, None)
+      return
+    opener = self._url_opener_prefix + '['
+    closer = '](' + self._href + ')' + self._url_opener_suffix
+    self._Write(output, opener, text, closer)
+
+  def _WriteContent(self, output):
+    text = self._content.getvalue()
+    if not text:
+      return
+    if text.startswith(('http://', 'https://')):
+      # Link text that is itself a URL is written in angle brackets.
+      self._Write(output, '<', text, '>')
+    else:
+      self._WriteLink(output, text)
+
+
+class Image(Text):
+  """Markdown image reference."""
+
+  def __init__(self, src, alt, width, height):
+    super().__init__()
+    self._src = src
+    # A missing or empty alt text falls back to the word 'image'.
+    self._alt = alt if alt else 'image'
+    self._width = width
+    self._height = height
+
+  def _WriteContent(self, output):
+    body = self._alt + '](' + self._src
+    self._Write(output, '![', body, ')')
+
+
+class Code(Text):
+  """Inline code span."""
+
+  def __init__(self):
+    super().__init__(indent=None, prefix='`', suffix='`')
+
+  def EscapeText(self, text):
+    # Inline code is passed through as is.
+    return text
+
+  def _WriteContent(self, output):
+    content = self._content.getvalue()
+    if '`' not in content:
+      self._Write(output, self._prefix, content, self._suffix)
+      return
+    # If a backtick (`) is present inside inline code, the fragment
+    # must use double backticks.
+    # Since having content starting or ending with a backtick would emit
+    # triple backticks which designates a fenced code fragment, pad content
+    # to avoid this.
+    if content.startswith('`'):
+      content = ' ' + content
+    if content.endswith('`'):
+      content += ' '
+    self._Write(output, '``', content, '``')
+
+  def NeedsToMergeWith(self, text):
+    # Only another inline code fragment is merged with this one.
+    return isinstance(text, Code)
+
+
+class EmbeddedContent(Text):
+  """Embedded content: Docs, Drawings, Presentations, etc."""
+
+  def __init__(self, href, width, height):
+    super().__init__()
+    self._href = href
+    self._width = width
+    self._height = height
+
+  def _WriteContent(self, output):
+    parts = urllib.parse.urlsplit(self._href)
+    if parts.scheme == 'http':
+      # Plain http sources are rewritten to https.
+      parts = parts._replace(scheme='https')
+    size = ''
+    if self._width and self._height:
+      size = ' width="{}" height="{}"'.format(self._width, self._height)
+    # Note: 'allow="fullscreen"' is requested for all content for simplicity.
+    # g3doc server has dedicated logic to deal with these requests.
+    element = '<iframe src="{}"{} allow="fullscreen" />'.format(
+        urllib.parse.urlunsplit(parts), size)
+    self._Write(output, None, element, None)
+
+
+class ListInfo:
+  """Bookkeeping record for one open HTML list."""
+
+  def __init__(self, tag):
+    # tag: the tag used to start the list.
+    # item_count: the number of items in the list, starting at zero.
+    self.tag, self.item_count = tag, 0
+
+
+class FragmentTree:
+  """Class for managing a tree of fragments.
+
+  There is a "scope" formed by nested fragments, e.g.
+  italic fragment inside bold fragment inside paragraph.
+  The scope is stored in the stack. For convenience,
+  the stack always has at least one element.
+
+  Fragments popped out from the scope may be re-added
+  back into the tree as children of the last fragment.
+  This allows "chaining" of structured content for future
+  processing. For example, if there were several bold
+  fragments inside a paragraph interleaved with fragments
+  of regular text, all these fragments will end up as
+  children of the paragraph fragment.
+
+  """
+
+  def __init__(self, top_fragment):
+    self._stack = [top_fragment]
+
+  def ActiveFragmentScopeDepth(self):
+    """Returns the number of scope fragments, not counting the topmost one."""
+    return len(self._stack) - 1
+
+  def StartFragment(self, fragment):
+    """Pushes fragment onto the scope with the current top as its parent."""
+    fragment.SetParent(self._stack[-1])
+    self._stack.append(fragment)
+    return fragment
+
+  def EndFragment(self):
+    """Pops and returns the innermost scope fragment."""
+    return self._stack.pop()
+
+  def AppendFragment(self, fragment):
+    """Adds fragment as a child of the innermost scope fragment."""
+    return self._stack[-1].AddChild(fragment)
+
+  def _ApplyRecursivelyToNode(self, node, scope_operation, operation,  # pylint: disable=missing-docstring
+                              debug_indent):
+    # Post-order: the children are handled before the node itself.
+    if not debug_indent:
+      for child in node.GetChildren():
+        self._ApplyRecursivelyToNode(child, scope_operation, operation, None)
+    else:
+      debug_indent += ' c '
+      for child in node.GetChildren():
+        print(debug_indent + repr(child))
+        self._ApplyRecursivelyToNode(child, scope_operation, operation,
+                                     debug_indent)
+    operation(node)
+
+  def _ApplyRecursivelyToScope(self, nodes, scope_operation, operation,  # pylint: disable=missing-docstring
+                               debug_indent):
+    # 'nodes' holds the scope innermost-first, so pop() yields the outermost
+    # remaining fragment. The scope operation runs on the way down and the
+    # node operation on the way back up.
+    node = nodes.pop()
+    scope_operation(node)
+    if debug_indent:
+      print(debug_indent + repr(node))
+    if nodes:
+      self._ApplyRecursivelyToScope(nodes, scope_operation, operation,
+                                    (debug_indent + ' s ' if debug_indent
+                                     else None))
+    self._ApplyRecursivelyToNode(node, scope_operation, operation,
+                                 debug_indent)
+
+  def ApplyToAllFragments(self, scope_operation, operation):
+    """Recursively applies operations to all fragments in the tree.
+
+    The omnipresent topmost fragment is excluded. The 'scope_operation'
+    is applied to every element in the fragment stack in pre-order.
+    The 'operation' is applied to all fragments in the tree in post-order.
+
+    Args:
+      scope_operation: The operation to apply to fragments in the scope stack.
+      operation: The operation to apply to all fragments in the tree.
+    """
+    scope = list(reversed(self._stack[1:]))
+    if not scope:
+      # Only the excluded topmost fragment is present, so there is nothing
+      # to visit (and nothing to pop in _ApplyRecursivelyToScope).
+      return
+    self._ApplyRecursivelyToScope(scope,
+                                  scope_operation, operation,
+                                  ' ' if FLAGS.debug_print_tree else None)
+
+  def FindFirstFragmentFromEnd(self, predicate, steps_from_last=0):
+    """Finds the innermost stack fragment that satisfies the predicate.
+
+    Args:
+      predicate: Callable taking a fragment and returning a truth value.
+      steps_from_last: Number of innermost fragments to leave out.
+
+    Returns:
+      The matching fragment closest to the end of the stack, or None.
+    """
+    sub_stack = self._stack[:-steps_from_last if steps_from_last else None]
+    # Search backwards so that the match nearest to the end wins.
+    return next((node for node in reversed(sub_stack) if predicate(node)),
+                None)
+
+  def PeekFragmentFromStart(self, steps_from_first=0):
+    return self._stack[steps_from_first]
+
+  def PeekFragmentFromEnd(self, steps_from_last=0):
+    return self._stack[-(steps_from_last + 1)]
+
+  def PeekLastAppendedFragment(self):
+    """Returns the last child of the innermost fragment, or None."""
+    return (self._stack[-1].GetChildren()[-1]
+            if self._stack[-1].GetChildren() else None)
+
+
+class MarkdownGenerator:
+  """Generates Markdown based on the series of HTML tags seen.
+
+  Each time an opening HTML tag is seen, the appropriate markdown fragment is
+  created and pushed onto a stack. Any text encountered is appended to the
+  fragment at the top of the stack. When a closing HTML tag is seen, the stack
+  is popped and the fragment removed is appended to the new top of the stack.
+
+  Markdown is buffered in the fragment stack until an entire line has been
+  formed, at which point _WriteFragmentsAsLine() is called to write it out. The
+  content buffered in the stack is cleared, but otherwise the stack remains
+  unmodified.
+  """
+
+  def __init__(self, out, url_translator):
+    """Initializes the generator.
+
+    Args:
+      out: Stream with a write() method that receives the Markdown.
+      url_translator: Object whose Translate() method is applied to anchor
+        hrefs and image sources.
+    """
+    self._out = out
+    self._url_translator = url_translator
+    self._fragment_tree = FragmentTree(Text())
+    self._list_info_stack = []
+    self._pending_newlines = 0
+    # Initialize the regexps to match nothing (rather than be None).
+    self._code_class_regex = re.compile(FLAGS.code_class_regex or 'a^')
+    self._toc_class_regex = re.compile(FLAGS.toc_class_regex or 'a^')
+    self._ignore_class_regex = re.compile(FLAGS.ignore_class_regex or 'a^')
+    self._ignore_style_regex = re.compile(FLAGS.ignore_style_regex or 'a^')
+
+  def _Push(self, fragment):
+    """Sets the parent fragment and pushes it onto the fragment stack.
+
+    In the case where there is an IgnoreBlock on the stack, a new IgnoreBlock
+    is pushed instead.
+
+    Args:
+      fragment: The Fragment object to push on the stack.
+    """
+    if isinstance(self._fragment_tree.PeekFragmentFromEnd(), IgnoreBlock):
+      # If the top of the stack is IgnoreBlock, push an IgnoreBlock instead.
+      fragment = IgnoreBlock()
+    else:
+      # Check if we need to merge adjacent formatting, e.g.
+      # instead of **bold****bold** we need to write **boldbold**,
+      # as the former is not correct Markdown syntax.
+      last_appended = self._fragment_tree.PeekLastAppendedFragment()
+      if last_appended and last_appended.NeedsToMergeWith(fragment):
+        last_appended.UnsetSuffix()
+        fragment.UnsetPrefix()
+
+    self._fragment_tree.StartFragment(fragment)
+
+  def _Pop(self):
+    """Pops the top fragment off the fragment stack.
+
+    While more than one fragment is in scope, the popped fragment is appended
+    as a child of the new top of the stack. Otherwise the buffered content is
+    written to the output first and the fragment is then dropped.
+    """
+    if self._fragment_tree.ActiveFragmentScopeDepth() > 1:
+      fragment = self._fragment_tree.EndFragment()
+      self._fragment_tree.AppendFragment(fragment)
+    else:
+      self._WriteFragmentsAsLine(newlines=0)
+      self._fragment_tree.EndFragment()
+
+  def _IsWithinFragmentType(self, fragment_type, steps_from_last=0):
+    """Returns whether a fragment of the given type is on the stack.
+
+    Args:
+      fragment_type: The fragment class to look for.
+      steps_from_last: Number of fragments at the end of the stack to skip.
+    """
+    return self._fragment_tree.FindFirstFragmentFromEnd(
+        lambda fragment: isinstance(fragment, fragment_type),
+        steps_from_last) is not None
+
+  def Break(self):
+    """Handles a line break (<br>)."""
+    if not self._IsWithinFragmentType(FencedCodeBlock):
+      self._WriteFragmentsAsLine(newlines=1)
+    else:
+      # Inside a fenced code block nothing is written yet; the break is
+      # recorded as an encoded newline in a line fragment of its own.
+      fragment = FencedCodeBlockLine(prefix='', suffix='')
+      self._Push(fragment)
+      fragment.Append(ENCODED_NEWLINE)
+      self._Pop()
+
+  def HorizontalRule(self):
+    # Horizontal rule must be preceded and followed by a blank line
+    self._AddVerticallyPaddedParagraph('---')
+
+  def StartDocument(self):
+    self._Push(WrappedTextBlock())
+
+  def EndDocument(self):
+    self._Pop()
+
+  def StartParagraph(self):
+    self._WriteFragmentsAsLine(newlines=2)
+
+  def EndParagraph(self):
+    self._WriteFragmentsAsLine(newlines=2)
+
+  def StartDiv(self, cls, style):
+    """Process opening of a div element.
+
+    Args:
+      cls: The class attribute of the element.
+      style: The style attribute of the element.
+    """
+    # Flush what is buffered so far, except inside a fenced code block, which
+    # is not written before its end.
+    if not self._IsWithinFragmentType(FencedCodeBlock):
+      if self._IsWithinFragmentType(CodeBlock):
+        self._WriteFragmentsAsLine(newlines=1)
+      else:
+        self._WriteFragmentsAsLine(newlines=2)
+
+    # Pick the fragment type for the div; the first matching rule wins.
+    if ((cls and self._ignore_class_regex.match(cls)) or
+        style and self._ignore_style_regex.match(style)):
+      self._Push(IgnoreBlock())
+    elif self._IsWithinFragmentType(FencedCodeBlock):
+      self._Push(FencedCodeBlockLine())
+    elif self._IsWithinFragmentType(CodeBlock):
+      self._Push(CodeBlock())
+    elif self._IsWithinFragmentType(BlockquoteBlock):
+      self._Push(BlockquoteBlock(indent=None))
+    elif cls and self._toc_class_regex.match(cls):
+      self._AddTableOfContents()
+      self._Push(IgnoreBlock())  # Ignore the items inside the Sites TOC
+    elif cls and self._code_class_regex.match(cls):
+      if FLAGS.indented_code_blocks:
+        self._Push(IndentedCodeBlock())
+      else:
+        self._Push(FencedCodeBlock())
+    else:
+      self._Push(WrappedTextBlock())
+
+  def EndDiv(self):
+    """Process closing of a div element."""
+    # steps_from_last=1 leaves the div's own fragment out of the check, so
+    # only the enclosing fragments decide whether and how to flush.
+    if not self._IsWithinFragmentType(FencedCodeBlock, steps_from_last=1):
+      if self._IsWithinFragmentType(CodeBlock, steps_from_last=1):
+        self._WriteFragmentsAsLine(newlines=1)
+      else:
+        self._WriteFragmentsAsLine(newlines=2)
+    self._Pop()
+
+  def StartHeader(self, level):
+    """Process opening of a header element.
+
+    Args:
+      level: The header level as an integer (1 for h1, and so on).
+    """
+    self._WriteFragmentsAsLine(newlines=2)
+    if level == 1 and FLAGS.underline_headers:
+      self._Push(UnderlinedHeader('='))
+    elif level == 2 and FLAGS.underline_headers:
+      self._Push(UnderlinedHeader('-'))
+    else:
+      self._Push(TextBlock(prefix=('#' * level) + ' '))
+
+  def EndHeader(self):
+    self._WriteFragmentsAsLine(newlines=2)
+    self._Pop()
+
+  def StartList(self, tag):
+    """Process opening of a list element.
+
+    Args:
+      tag: The list tag; 'ol' selects the ordered list indent, any other
+        value the unordered list indent.
+    """
+    # An outermost list is separated by a blank line, a nested one is not.
+    if not self._list_info_stack:
+      self._WriteFragmentsAsLine(newlines=2)
+    else:
+      self._WriteFragmentsAsLine(newlines=1)
+    self._list_info_stack.append(ListInfo(tag))
+    if tag == 'ol':
+      self._Push(Text(' ' * FLAGS.ordered_list_indent))
+    else:
+      self._Push(Text(' ' * FLAGS.unordered_list_indent))
+
+  def EndList(self):
+    self._list_info_stack.pop()
+    if not self._list_info_stack:
+      self._WriteFragmentsAsLine(newlines=2)
+    else:
+      self._WriteFragmentsAsLine(newlines=1)
+    self._Pop()
+
+  def StartListItem(self):
+    """Process opening of a list item element."""
+    self._WriteFragmentsAsLine(newlines=1)
+    # Google Sites sometimes spits out pages with <li> tags not enclosed within
+    # an <ol> or <ul> tag.
+    tag = ''
+    if self._list_info_stack:
+      self._list_info_stack[-1].item_count += 1
+      tag = self._list_info_stack[-1].tag
+    if tag == 'ol':
+      item_count = self._list_info_stack[-1].item_count
+      # string.ljust makes room for as many digits as you need.
+      prefix = ('%d.' % item_count).ljust(FLAGS.ordered_list_indent)
+      self._Push(ListItem(prefix))
+    else:
+      prefix = '*'.ljust(FLAGS.unordered_list_indent)
+      self._Push(ListItem(prefix))
+
+  def EndListItem(self):
+    self._WriteFragmentsAsLine(newlines=1)
+    self._Pop()
+
+  def StartFormat(self, tag):
+    """Process opening of an inline formatting element.
+
+    Args:
+      tag: The name of the formatting tag, e.g. 'b' or 'code'.
+    """
+    # Allowed formatting depends on the surrounding fragment type.
+    if not self._IsWithinFragmentType(IndentedCodeBlock):
+      # The None key holds the fallback for tags without an entry.
+      formats_map = {
+          'i': ItalicFormattedText,
+          'em': ItalicFormattedText,
+          'b': BoldFormattedText,
+          'strong': BoldFormattedText,
+          'strike': StrikeThroughFormattedText,
+          's': StrikeThroughFormattedText,
+          'del': StrikeThroughFormattedText,
+          'u': HighlightFormattedText,
+          'code': Code,
+          None: Text,
+      }
+      if self._IsWithinFragmentType(FencedCodeBlock):
+        if FLAGS.allow_html_code_blocks:
+          # HTML code block can render formats but must not use Code fragments.
+          formats_map['code'] = formats_map[None] = CodeBlock
+        else:
+          formats_map = {None: CodeBlock}
+    else:
+      # Inside an indented code block no formatting is allowed.
+      formats_map = {None: CodeBlock}
+    self._Push(formats_map[tag]() if tag in formats_map
+               else formats_map[None]())
+
+  def EndFormat(self):
+    self._Pop()
+
+  def StartAnchor(self, href):
+    """Process opening of an anchor element.
+
+    Args:
+      href: The href attribute of the element, or None if it has none.
+    """
+    if href is not None:
+      href = self._url_translator.Translate(href)
+      self._Push(Link(href))
+    else:
+      # An anchor without href is rendered as plain text.
+      self._Push(Text())
+
+  def EndAnchor(self):
+    self._Pop()
+
+  def StartBlockquote(self):
+    if not self._IsWithinFragmentType(CodeBlock):
+      self._WriteFragmentsAsLine(newlines=1)
+      self._Push(BlockquoteBlock())
+    else:
+      self._Push(Text())
+
+  def EndBlockquote(self):
+    if not self._IsWithinFragmentType(CodeBlock):
+      self._WriteFragmentsAsLine(newlines=2)
+    self._Pop()
+
+  def Image(self, src, alt, width, height):
+    """Process an <img> element.
+
+    Args:
+      src: Source URL; it is passed through the URL translator.
+      alt: Alternative text.
+      width: Element width.
+      height: Element height.
+    """
+    src = self._url_translator.Translate(src)
+    self._fragment_tree.AppendFragment(Image(src, alt, width, height))
+
+  def Iframe(self, src, width, height):
+    """Process an <iframe> element.
+
+    Sites use <iframe> for embedded content: Docs, Drawings, etc.
+    g3doc implements this by supporting <iframe> HTML tag directly.
+
+    Args:
+      src: Source URL.
+      width: Element width.
+      height: Element height.
+    """
+    # The branch below is deliberately disabled, so this method currently
+    # emits nothing.
+    if False:
+      # TODO(dpranke): Figure out if we should support embedded IFRAME tags.
+      # For now, we skip over them.
+      self._WriteFragmentsAsLine(newlines=2)
+      self._Push(EmbeddedContent(src, width, height))
+      self._Pop()
+
+  def Text(self, text):
+    """Process character data.
+
+    Args:
+      text: The text to add; it is dropped while the innermost fragment is an
+        IgnoreBlock.
+    """
+    if not isinstance(self._fragment_tree.PeekFragmentFromEnd(), IgnoreBlock):
+      fragment = (CodeBlock() if self._IsWithinFragmentType(CodeBlock)
+                  else Text())
+      self._fragment_tree.AppendFragment(fragment)
+      fragment.Append(text)
+
+  def _AddTableOfContents(self):
+    # TOC must be preceded and followed by a blank line
+    self._AddVerticallyPaddedParagraph('[TOC]')
+
+  def _AddVerticallyPaddedParagraph(self, text):
+    """Writes text as a paragraph of its own, without escaping it."""
+    self._WriteFragmentsAsLine(newlines=2)
+    fragment = CodeBlock()  # Use CodeBlock to prevent escaping
+    self._fragment_tree.AppendFragment(fragment)
+    fragment.Append(text)
+    self._WriteFragmentsAsLine(newlines=2)
+
+  def _WriteFragmentsAsLine(self, newlines):
+    """Writes out any content currently buffered in the fragment stack.
+
+    Args:
+      newlines: The minimum number of newlines required in the output after this
+        line. These newlines won't be written out until the next line with
+        content is encountered.
+    """
+
+    # Generate indent and the content, then clear content in fragments.
+    indent = io.StringIO()
+    self._fragment_tree.ApplyToAllFragments(
+        lambda fragment: fragment.WriteIndent(indent),
+        lambda fragment: fragment.WriteContentIntoParentAndClear())
+    last_fragment = self._fragment_tree.PeekFragmentFromEnd()
+    content = self._fragment_tree.PeekFragmentFromStart().ConsumeContent()
+    content = last_fragment.StripLine(content.getvalue())
+    indent = indent.getvalue()
+    content = last_fragment.WrapLine(content, indent)
+
+    # Write the content, if any.
+    if content:
+      self._out.write('\n' * self._pending_newlines)
+      self._out.write(indent)
+      self._out.write(content)
+      self._pending_newlines = newlines
+    elif self._pending_newlines > 0 and self._pending_newlines < newlines:
+      # Nothing to write: only raise the number of newlines already pending.
+      self._pending_newlines = newlines
+
+    if FLAGS.debug_print_tree:
+      # Separate trees printed during each writing session
+      print('-' * 20)
+
+class XhtmlHandler(xml.sax.ContentHandler):
+  """SAX content handler that forwards XHTML events to a MarkdownGenerator."""
+
+  # regex that matches an HTML header tag and extracts the level.
+  _HEADER_TAG_RE = re.compile(r'h([1-6])$')
+
+  # Tags that are handled as inline formatting.
+  _FORMAT_TAGS = ('b', 'code', 'em', 'i', 'strong', 's', 'strike', 'del', 'u')
+
+  def __init__(self, out, url_translator):
+    super().__init__()
+    self._generator = MarkdownGenerator(out, url_translator)
+
+  def startDocument(self):
+    self._generator.StartDocument()
+
+  def endDocument(self):
+    self._generator.EndDocument()
+
+  def startElementNS(self, name, qname, attrs):
+    gen = self._generator
+    tag = name[1]
+
+    def Attr(key):
+      # Attributes are looked up without a namespace.
+      return attrs.get((None, key))
+
+    if tag == 'a':
+      gen.StartAnchor(Attr('href'))
+    elif tag == 'br':
+      gen.Break()
+    elif tag == 'hr':
+      gen.HorizontalRule()
+    elif tag == 'li':
+      gen.StartListItem()
+    elif tag == 'div':
+      gen.StartDiv(Attr('class'), Attr('style'))
+    elif tag == 'p':
+      gen.StartParagraph()
+    elif tag in self._FORMAT_TAGS:
+      gen.StartFormat(tag)
+    elif tag in ('ul', 'ol'):
+      gen.StartList(tag)
+    elif tag == 'img':
+      gen.Image(Attr('src'), Attr('alt'), Attr('width'), Attr('height'))
+    elif tag == 'blockquote':
+      gen.StartBlockquote()
+    elif tag == 'iframe':
+      gen.Iframe(Attr('src'), Attr('width'), Attr('height'))
+    else:
+      header = self._HEADER_TAG_RE.match(tag)
+      if header:
+        gen.StartHeader(int(header.group(1)))
+
+  def endElementNS(self, name, qname):
+    gen = self._generator
+    tag = name[1]
+    if tag == 'a':
+      gen.EndAnchor()
+    elif tag == 'li':
+      gen.EndListItem()
+    elif tag == 'div':
+      gen.EndDiv()
+    elif tag == 'p':
+      gen.EndParagraph()
+    elif tag in self._FORMAT_TAGS:
+      gen.EndFormat()
+    elif tag in ('ul', 'ol'):
+      gen.EndList()
+    elif tag == 'blockquote':
+      gen.EndBlockquote()
+    elif self._HEADER_TAG_RE.match(tag):
+      gen.EndHeader()
+
+  def characters(self, content):
+    self._generator.Text(content)
+
+
+class DefaultUrlTranslator:
+  """UrlTranslator that hands every URL back untouched."""
+
+  def Translate(self, href):
+    unchanged = href
+    return unchanged
+
+
+def Convert(input_stream, output_stream, url_translator=DefaultUrlTranslator()):
"""Converts an input stream of xhtml into an output stream of markdown.
Args:
@@ -23,6 +1191,7 @@
output_stream: filehandle for the Markdown output.
url_translator: Callback for translating URLs embedded in the page.
"""
- # TODO(dpranke): replace this with the real conversion routine.
- del url_translator
- output_stream.write(input_stream.read())
+ parser = xml.sax.make_parser()
+ parser.setContentHandler(XhtmlHandler(output_stream, url_translator))
+ parser.setFeature(xml.sax.handler.feature_namespaces, 1)
+ parser.parse(input_stream)