# blob: 8526cad71fa03fa377a0e118cec557cdec8b8ea7 [file] [log] [blame]
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HTML to Markdown renderer."""
import io
import os
import re
import textwrap
import urllib
import urllib.parse
import xml.sax
class _Flags:
  """Tunable settings for the renderer, kept as plain class attributes."""

  # Render h1/h2 headers underlined with '=' / '-' rather than '#' prefixes.
  underline_headers = False

  # Characters that are escaped with a backslash in the generated Markdown.
  # This is deliberately not the full set of special Markdown characters,
  # only the ones that are most often misread as Markdown syntax; escaping
  # everything blindly produces ugly Markdown.
  escape_chars = r'\`*[]'

  # Marker placed on both sides of italic text.
  italic_format = '*'

  # Marker placed on both sides of bold text.
  bold_format = '**'

  # Marker placed on both sides of struck-through text.
  strike_format = '~~'

  # Marker placed on both sides of underlined (<u>) text, which is rendered
  # as highlighted text.
  highlight_format = '=='

  # Number of columns an unordered list is indented by, bullet included.
  # For example, a value of 4 yields '*   '.
  unordered_list_indent = 4

  # Number of columns an ordered list is indented by, number included.
  # For example, a value of 4 yields '1.  '.
  ordered_list_indent = 4

  # Class attribute of DIV blocks that should be formatted as code.
  code_class_regex = r'^sites-codeblock sites-codesnippet-block$'

  # Class attribute of DIV blocks that hold a table of contents.
  toc_class_regex = r'^sites-embed-content sites-embed-type-toc$'

  # Class attribute of DIV blocks that should be ignored.
  ignore_class_regex = r''

  # Style attribute of DIV blocks that should be ignored.
  ignore_style_regex = r'^display:none;$'

  # Column limit used when wrapping text blocks. Zero disables wrapping.
  line_width = 80

  # Emit indented code blocks when True, fenced code blocks when False.
  indented_code_blocks = False

  # Fall back to an HTML code block instead of a fenced one when the source
  # code block contains formatted text.
  allow_html_code_blocks = True

  # Links matching this pattern are recognized automatically by the renderer.
  shortlinks_regex = r'^http://(ag|b|cl|g|go|who)/'

  # Print the fragment tree while rendering, for debugging.
  debug_print_tree = False


FLAGS = _Flags()
def _EscapeText(text, reserved_chars):
  """Puts a backslash in front of every reserved character in `text`.

  Args:
    text: The string to escape.
    reserved_chars: A string holding the characters that need escaping.

  Returns:
    A copy of `text` in which each character found in `reserved_chars` is
    preceded by a backslash; all other characters are copied unchanged.
  """
  return ''.join(
      '\\' + ch if ch in reserved_chars else ch for ch in text)
def _EscapeContentForHtml(text):
  """Replaces '<' and '>' in `text` with their HTML entities.

  Args:
    text: The string to escape.

  Returns:
    An io.StringIO holding the escaped text (not a plain string). The stream
    is left positioned after the written text, so later writes append to it.
  """
  escaped = io.StringIO()
  # '&lt;' contains no '>', so the second replacement cannot touch the
  # entities produced by the first one.
  escaped.write(text.replace('<', '&lt;').replace('>', '&gt;'))
  return escaped
# Character reference standing in for '\n' while a line is being assembled.
ENCODED_NEWLINE = '&#%d;' % ord('\n')


def _RestoreEncodedNewlines(text):
  """Turns every ENCODED_NEWLINE marker in `text` back into a real newline.

  Args:
    text: The string that may contain encoded newlines.

  Returns:
    The string with each marker replaced by '\\n'.
  """
  return '\n'.join(text.split(ENCODED_NEWLINE))
def _WrapLine(line, indent):
  """Wraps the line to fit into the column limit.

  Wrapping is controlled by FLAGS.line_width; a value of zero (or less)
  disables it. Words are never split, neither in the middle nor at hyphens.

  Args:
    line: The string to wrap.
    indent: An integer with the number of columns of indentation. The first
      output line is not indented here; every continuation line is prefixed
      with this many spaces.

  Returns:
    The wrapped text.
  """
  if FLAGS.line_width <= 0:
    return line
  # textwrap.wrap() raises ValueError for a non-positive width, which would
  # happen as soon as the indentation reaches the column limit. Clamp the
  # width instead; since long words are never broken this degrades to one
  # word per line.
  width = max(1, FLAGS.line_width - indent)
  wrapped = textwrap.wrap(
      line,
      width=width,
      break_long_words=False,
      break_on_hyphens=False)
  return ('\n' + ' ' * indent).join(wrapped)
class Fragment:
  """Base class for all output fragments.

  A fragment buffers text in an in-memory stream and optionally carries an
  indent, a prefix and a suffix. Fragments form a tree through AddChild().

  To generate a line of output, the methods will be called in the following
  order:
    WriteIndent()
    WriteContentIntoParentAndClear()
    ConsumeContent() -- for the topmost fragment only
    StripLine()
    WrapLine()
  """

  def __init__(self, indent, prefix, suffix):
    """Creates an empty fragment.

    Args:
      indent: String written by WriteIndent(), or None for no indent.
      prefix: String written before non-blank content, or None.
      suffix: String written after non-blank content, or None.
    """
    self._content = io.StringIO()
    self._indent = indent
    self._prefix = prefix
    self._suffix = suffix
    self._parent = None
    self._children = []

  def __repr__(self):
    """Returns a debug description; missing parts are shown as empty."""
    # Everything is kept as str. Encoding the parts to bytes would make the
    # concatenation below raise TypeError on Python 3.
    return ('{' +
            self.__class__.__name__ +
            ': indent=' + (self._indent or '') +
            '; prefix=' + (self._prefix or '') +
            '; content=' + self._content.getvalue() +
            '; suffix=' + (self._suffix or '') +
            '}')

  def SetParent(self, parent):
    """Records `parent` as the fragment this one writes its content into."""
    self._parent = parent

  def AddChild(self, node):
    """Appends `node` to the children, makes self its parent, returns it."""
    self._children.append(node)
    node.SetParent(self)
    return node

  def GetChildren(self):
    """Returns the list of direct children (the list itself, not a copy)."""
    return self._children

  def _AllChildren(self):
    """Returns all descendants in pre-order (each child before its own)."""
    all_children = []

    def Traverse(fragment):
      for c in fragment.GetChildren():
        all_children.append(c)
        Traverse(c)

    Traverse(self)
    return all_children

  def WriteIndent(self, output):
    """Writes this fragment's indent, if it has one, to `output`."""
    if self._indent:
      output.write(self._indent)

  def WriteContentIntoParentAndClear(self):
    """Renders the content into the parent's buffer, then resets self.

    Both the content buffer and the list of children are emptied.
    """
    self._WriteContent(self._parent._content)  # pylint: disable=protected-access
    self._ClearContent()
    self._children = []

  def _WriteContent(self, output):
    """Implementation of content rendering. Can be overridden in subclasses."""
    self._Write(output, self._prefix, self._content.getvalue(), self._suffix)

  def _Write(self, output, prefix, content, suffix):
    """Default implementation of content rendering for reuse by subclasses.

    The content is always written. The prefix and suffix are written only
    when the content contains something other than whitespace.
    """
    has_content = bool(content.strip())
    if prefix and has_content:
      output.write(prefix)
    output.write(content)
    if suffix and has_content:
      output.write(suffix)

  def UnsetSuffix(self):
    """Drops the suffix."""
    self._suffix = ''

  def UnsetPrefix(self):
    """Drops the prefix."""
    self._prefix = ''

  def _UpdatePrefixAndSuffix(self, prefix, suffix):
    """Replaces the prefix/suffix, but only where one is currently set.

    A prefix or suffix that is empty or None (for example after
    UnsetPrefix()) stays as it is.
    """
    if self._prefix:
      self._prefix = prefix
    if self._suffix:
      self._suffix = suffix

  def _ClearContent(self):
    """Clears the content. This will only be called after it's been written."""
    self._content = io.StringIO()

  def ConsumeContent(self):
    """Returns the current content buffer (a StringIO) and starts a new one."""
    content = self._content
    self._ClearContent()
    return content

  def Append(self, text):
    """Appends text.

    Args:
      text: The string to append, it will be escaped.
    """
    assert isinstance(text, str)
    self._content.write(self.EscapeText(text))

  def EscapeText(self, text):
    """Escapes any reserved characters when Append() is called with text.

    By default this defers to the parent fragment.

    Args:
      text: The string to escape.

    Returns:
      The escaped string.
    """
    if self._parent:
      return self._parent.EscapeText(text)
    return text

  def StripLine(self, text):
    """Does any needed stripping of whitespace.

    Some blocks (code for example) will want to preserve whitespace, while
    others will want to coalesce it together. By default this defers to the
    parent fragment.

    Args:
      text: The string to strip

    Returns:
      The stripped string.
    """
    if self._parent:
      return self._parent.StripLine(text)
    return text

  def WrapLine(self, line, indent):
    """Wraps the line to fit into the column limit, if necessary.

    Most blocks (code for example) will want to preserve whitespace and won't
    break their output, so the base implementation returns the line as is.

    Args:
      line: The string to wrap.
      indent: Indent string.

    Returns:
      The wrapped string.
    """
    del indent
    return line

  def NeedsToMergeWith(self, text):
    """Tells whether this fragment must be merged with fragment `text`.

    The base implementation never merges.
    """
    del text
    return False
class HTML(Fragment):
  """Markdown fragment holding a raw HTML string that is never escaped."""

  def __init__(self):
    super().__init__(None, None, None)

  def EscapeText(self, text):
    """Returns `text` untouched; HTML is emitted verbatim."""
    return text
class Text(Fragment):
  """Markdown fragment for a plain string; every constructor part is optional."""

  def __init__(self, indent=None, prefix=None, suffix=None):
    super().__init__(indent=indent, prefix=prefix, suffix=suffix)
class IgnoreBlock(Fragment):
  """Marker fragment for content that is to be left out of the output."""

  def __init__(self):
    super().__init__(indent=None, prefix=None, suffix=None)
class TextBlock(Text):
  """Text fragment that collapses whitespace and escapes reserved chars."""

  def EscapeText(self, text):
    """Escapes '<' and '>' as entities, then the FLAGS.escape_chars set."""
    html_safe = _EscapeContentForHtml(text).getvalue()
    return _EscapeText(html_safe, FLAGS.escape_chars)

  def StripLine(self, text):
    """Flattens `text` into a single line with single spaces between words."""
    # Newlines count as spaces. Unicode non-breaking spaces (code point 160)
    # are turned into plain spaces too, which is safer than deleting them
    # because runs of spaces are squeezed right after.
    flattened = text.replace('\n', ' ').replace(chr(160), ' ')
    return re.sub(r' +', ' ', flattened.strip())
class Div(TextBlock):
  """Placeholder fragment used while converting the two-column layout.

  The `cls` attribute holds the class name the placeholder was created for.
  """

  def __init__(self, cls):
    super().__init__()
    self.cls = cls
class WrappedTextBlock(TextBlock):
  """TextBlock whose output lines are wrapped to fit the column limit."""

  def WrapLine(self, line, indent):
    """Wraps `line`, reserving as many columns as `indent` is long."""
    indent_columns = len(indent)
    return _WrapLine(line, indent_columns)
class BlockquoteBlock(WrappedTextBlock):
  """Wrapped text in which every line starts with the quote marker '> '.

  Paragraphs inside a blockquote must be emitted by the generator as
  BlockquoteBlocks without an indent. Such blocks pass WrapLine() on to their
  parent, so the call ends up at the outermost BlockquoteBlock, which wraps
  the lines and puts the indent in front of each of them.
  """

  def __init__(self, indent='> '):
    super().__init__(indent=indent, prefix=None, suffix=None)

  def WrapLine(self, line, indent):
    """Wraps `line` and prepends `indent` to every continuation line."""
    if self._parent and not self._indent:
      # Inner block without its own marker: let the enclosing block do it.
      return self._parent.WrapLine(line, indent)
    pieces = _WrapLine(line, len(indent)).splitlines(True)
    # Continuation lines arrive padded with spaces; swap that padding for
    # the real indent string.
    return indent.join(piece.lstrip() for piece in pieces)
class CodeBlock(Text):
  """Common base for the code block fragment implementations."""

  def EscapeText(self, text):
    """Returns `text` as is; code is never Markdown-escaped."""
    return text

  def StripLine(self, text):
    """Removes every newline from `text`."""
    # Newlines in code blocks are ignored completely. Sites always uses <br/>.
    return ''.join(text.split('\n'))

  def ChangeToHtml(self):
    """Replaces non-empty content with its '<' / '>' escaped form."""
    raw = self._content.getvalue()
    if not raw:
      return
    self._content = _EscapeContentForHtml(raw)
class IndentedCodeBlock(CodeBlock):
  """An IndentedCodeBlock indents by four spaces.

  Four columns is the minimum indentation Markdown requires to treat a block
  as code, so that is the default indent; a narrower indent would render the
  content as ordinary text.
  """

  def __init__(self, indent='    '):
    """Creates the block.

    Args:
      indent: The indent string written in front of the block's lines.
        Defaults to four spaces.
    """
    super().__init__(indent, None, None)
class FencedCodeBlock(CodeBlock):
  """Code block delimited by triple-backtick (```) fences.

  For the block to render correctly, nothing may be written before the end of
  the source code block has been seen: the whole code block from the source
  HTML has to be rendered in one write pass.
  """

  def __init__(self, indent=None,
               prefix='```none' + ENCODED_NEWLINE,
               suffix=ENCODED_NEWLINE + '```'):
    super().__init__(indent, prefix, suffix)

  def WriteIndent(self, output):
    """Switches the block to HTML if needed, then writes the indent.

    When HTML code blocks are allowed and any descendant is FormattedText,
    the formatted fragments and all CodeBlock descendants are converted to
    HTML and the fences are replaced with <pre><code> tags.
    """
    if FLAGS.allow_html_code_blocks:
      descendants = self._AllChildren()
      formatted = [d for d in descendants if isinstance(d, FormattedText)]
      for fragment in formatted:
        fragment.ChangeToHtml()
      if formatted:
        for fragment in descendants:
          if isinstance(fragment, CodeBlock):
            fragment.ChangeToHtml()
        self._UpdatePrefixAndSuffix(
            '<pre><code>', ENCODED_NEWLINE + '</code></pre>')
    super().WriteIndent(output)

  def StripLine(self, text):
    """Restores the encoded newlines and drops the empty lines."""
    restored = _RestoreEncodedNewlines(super().StripLine(text))
    return '\n'.join(filter(None, restored.splitlines()))

  def WrapLine(self, line, indent):
    """Puts `indent` in front of every line after the first; no wrapping."""
    return indent.join(line.splitlines(True))
class FencedCodeBlockLine(Text):
  """One line of code within a FencedCodeBlock.

  By default the line is surrounded by encoded newlines, which StripLine()
  converts back into real ones.
  """

  def __init__(self, indent=None,
               prefix=ENCODED_NEWLINE, suffix=ENCODED_NEWLINE):
    super().__init__(indent=indent, prefix=prefix, suffix=suffix)

  def StripLine(self, text):
    """Applies the inherited stripping, then restores encoded newlines."""
    stripped = super().StripLine(text)
    return _RestoreEncodedNewlines(stripped)
class UnderlinedHeader(TextBlock):
  """Markdown fragment for a section header underlined with one character."""

  def __init__(self, char):
    super().__init__()
    self._char = char

  def _WriteContent(self, output):
    """Writes the header text plus an underline as long as the stripped text.

    Nothing is written when the stripped header text is empty.
    """
    raw = self._content.getvalue()
    underline_length = len(self.StripLine(raw))
    if underline_length <= 0:
      return
    # A real '\n' would be stripped, so an encoded newline is used and turned
    # back into '\n' once the line has been stripped.
    underline = ENCODED_NEWLINE + self._char * underline_length
    self._Write(output, None, raw, underline)

  def StripLine(self, text):
    """Strips like a TextBlock, then restores the encoded newlines."""
    stripped = super().StripLine(text)
    return _RestoreEncodedNewlines(stripped)
class FormattedText(Text):
  """Text wrapped in Markdown formatting.

  The same marker is used as prefix and as suffix.
  """

  def __init__(self, fmt):
    """Creates the fragment.

    Args:
      fmt: The formatting marker placed on both sides of the text.
    """
    super().__init__(None, fmt, fmt)

  def _Pad(self, bigger, smaller):
    """Returns as many spaces as `bigger` is longer than `smaller`."""
    return ' ' * (len(bigger) - len(smaller))

  def _WriteContent(self, output):
    """Writes the content with whitespace moved outside of the markers.

    Leading whitespace is replaced by the same number of spaces in front of
    the prefix, trailing whitespace by spaces after the suffix, so that the
    markers always touch the text.
    """
    prefix = self._prefix
    content = self._content.getvalue()
    suffix = self._suffix
    if prefix:
      # If there are whitespaces immediately after the prefix,
      # they must be pushed out before the prefix.
      lstripped = content.lstrip()
      if len(content) > len(lstripped):
        prefix = self._Pad(content, lstripped) + prefix
        content = lstripped
    if suffix:
      # If there are whitespaces immediately before the suffix,
      # they must be pushed out after the suffix.
      rstripped = content.rstrip()
      if len(content) > len(rstripped):
        suffix = suffix + self._Pad(content, rstripped)
        content = rstripped
    self._Write(output, prefix, content, suffix)

  def ChangeToHtml(self):
    """Escapes '<' and '>' in any buffered content for use inside HTML.

    Subclasses extend this to also switch their markers to HTML tags.
    """
    content = self._content.getvalue()
    if content:
      # Store the escaped buffer back on the fragment; assigning it to a
      # local would silently discard the escaping.
      self._content = _EscapeContentForHtml(content)
class BoldFormattedText(FormattedText):
  """Bold text, delimited by FLAGS.bold_format."""

  def __init__(self):
    super().__init__(fmt=FLAGS.bold_format)

  def NeedsToMergeWith(self, text):
    """True when `text` is bold as well, so adjacent markers can be merged."""
    return isinstance(text, BoldFormattedText)

  def ChangeToHtml(self):
    """Escapes the content and switches the markers to <b>...</b>."""
    super().ChangeToHtml()
    self._UpdatePrefixAndSuffix(prefix='<b>', suffix='</b>')
class ItalicFormattedText(FormattedText):
  """Italic text, delimited by FLAGS.italic_format."""

  def __init__(self):
    super().__init__(fmt=FLAGS.italic_format)

  def NeedsToMergeWith(self, text):
    """True when `text` is italic as well, so adjacent markers can be merged."""
    return isinstance(text, ItalicFormattedText)

  def ChangeToHtml(self):
    """Escapes the content and switches the markers to <i>...</i>."""
    super().ChangeToHtml()
    self._UpdatePrefixAndSuffix(prefix='<i>', suffix='</i>')
class StrikeThroughFormattedText(FormattedText):
  """Struck-through text, delimited by FLAGS.strike_format."""

  def __init__(self):
    super().__init__(fmt=FLAGS.strike_format)

  def NeedsToMergeWith(self, text):
    """True when `text` is struck through as well, so markers can be merged."""
    return isinstance(text, StrikeThroughFormattedText)

  def ChangeToHtml(self):
    """Escapes the content and switches the markers to <s>...</s>."""
    super().ChangeToHtml()
    self._UpdatePrefixAndSuffix(prefix='<s>', suffix='</s>')
class HighlightFormattedText(FormattedText):
  """Highlighted text, delimited by FLAGS.highlight_format."""

  def __init__(self):
    super().__init__(fmt=FLAGS.highlight_format)

  def NeedsToMergeWith(self, text):
    """True when `text` is highlighted as well, so markers can be merged."""
    return isinstance(text, HighlightFormattedText)

  def ChangeToHtml(self):
    """Escapes the content and switches the markers to <u>...</u>."""
    super().ChangeToHtml()
    self._UpdatePrefixAndSuffix(prefix='<u>', suffix='</u>')
class ListItem(Text):
  """A single list entry, introduced by a bullet or number string."""

  def __init__(self, bullet):
    super().__init__()
    self._bullet = bullet

  def WriteIndent(self, output):
    """Writes the pending bullet, preceded by one backspace per character.

    The bullet is written only while it is still pending; it is dropped the
    first time the content is cleared.
    """
    bullet = self._bullet
    if bullet:
      # TODO(dpranke): The original code relied on strings and bytes
      # being interchangeable in Python2, so you could seek backwards
      # from the current location with a relative offset. You can't
      # do that in Python3, apparently.
      #
      # To get around this for the moment, instead of seeking backwards
      # 4 characters, we embed 4 '\b' backspaces, and then have the client
      # do a global search and replace of ' \b\b\b\b' with '' instead.
      #
      # This is awkward, so we should rework this so that this isn't needed.
      #
      # output.seek(-len(self._bullet), os.SEEK_CUR)
      output.write('\b' * len(bullet))
      output.write(bullet)
    super().WriteIndent(output)

  def _ClearContent(self):
    """Forgets the bullet, then clears the content."""
    self._bullet = None
    super()._ClearContent()

  def WrapLine(self, line, indent):
    """Wraps `line`, reserving as many columns as `indent` is long."""
    indent_columns = len(indent)
    return _WrapLine(line, indent_columns)
class Link(Text):
  """Markdown link; MakeAnImage() turns it into an image reference."""

  def __init__(self, href):
    super().__init__()
    self._href = href
    # Text emitted before '[' and after the closing ')' of the link.
    self._url_opener_prefix = ''
    self._url_opener_suffix = ''

  def MakeAnImage(self, width, height):
    """Marks the link as an image; adds a size block if both sizes are set."""
    self._url_opener_prefix = '!'
    if width and height:
      size_block = '{{width="{}" height="{}"}}'.format(width, height)
      self._url_opener_suffix = size_block

  def _IsShortLink(self, text):
    """Checks whether `text` is just the short form of the link target.

    Returns:
      True if the target matches FLAGS.shortlinks_regex and its host plus
      path equals `text`; None otherwise.
    """
    pattern = FLAGS.shortlinks_regex
    if pattern and re.compile(pattern).match(self._href):
      parts = urllib.parse.urlsplit(self._href)
      if text == parts.netloc + parts.path:
        return True
    return None

  def _WriteLink(self, output, text):
    """Writes `text` alone for a short link, else the full link syntax."""
    is_plain_link = not (self._url_opener_prefix or self._url_opener_suffix)
    if is_plain_link and self._IsShortLink(text):
      self._Write(output, None, text, None)
      return
    opener = self._url_opener_prefix + '['
    closer = '](' + self._href + ')' + self._url_opener_suffix
    self._Write(output, opener, text, closer)

  def _WriteContent(self, output):
    """Writes the link; text that is itself a URL becomes an '<...>' link."""
    text = self._content.getvalue()
    if not text:
      return
    if text.startswith(('http://', 'https://')):
      self._Write(output, '<', text, '>')
    else:
      self._WriteLink(output, text)
class Image(Text):
  """Image rendered as a raw <img> tag."""

  def __init__(self, src, alt, width, height):
    super().__init__()
    self._src = src
    # A missing or empty alt text falls back to 'image'.
    self._alt = alt or 'image'
    self._width = width
    self._height = height

  def _WriteContent(self, output):
    """Writes the <img> tag; height and width are added only when set.

    The attribute values are inserted into the tag as they are.
    """
    pieces = ['<img alt="%s" src="%s"' % (self._alt, self._src)]
    if self._height:
      pieces.append(' height=%s' % self._height)
    if self._width:
      pieces.append(' width=%s' % self._width)
    pieces.append('>')
    self._Write(output, '', ''.join(pieces), '')
class Code(Text):
  """Inline code span delimited by backticks."""

  def __init__(self):
    super().__init__(indent=None, prefix='`', suffix='`')

  def EscapeText(self, text):
    """Returns `text` as is; inline code is never escaped."""
    return text

  def _WriteContent(self, output):
    """Writes the span, using double backticks if the code contains one."""
    body = self._content.getvalue()
    opener, closer = self._prefix, self._suffix
    if '`' in body:
      # Code that contains a backtick (`) has to be delimited by double
      # backticks.
      opener = closer = '``'
      # A backtick at either end of the code would then form a triple
      # backtick, which starts a fenced code fragment, so pad with a space.
      if body.startswith('`'):
        body = ' ' + body
      if body.endswith('`'):
        body += ' '
    self._Write(output, opener, body, closer)

  def NeedsToMergeWith(self, text):
    """True when `text` is inline code as well."""
    return isinstance(text, Code)
class EmbeddedContent(Text):
  """Embedded content (Docs, Drawings, Presentations, etc.) as an <iframe>."""

  def __init__(self, href, width, height):
    super().__init__()
    self._href = href
    self._width = width
    self._height = height

  def _WriteContent(self, output):
    """Writes the <iframe> element, upgrading an http source to https."""
    parts = urllib.parse.urlsplit(self._href)
    if parts.scheme == 'http':
      parts = urllib.parse.SplitResult(
          'https', parts.netloc, parts.path, parts.query, parts.fragment)
    # The size attributes are emitted only when both values are set.
    if self._width and self._height:
      size = ' width="{}" height="{}"'.format(self._width, self._height)
    else:
      size = ''
    # Note: 'allow="fullscreen"' is requested for all content for simplicity.
    # g3doc server has dedicated logic to deal with these requests.
    element = '<iframe src="{}"{} allow="fullscreen" />'.format(
        urllib.parse.urlunsplit(parts), size)
    self._Write(output, None, element, None)
class ListInfo:
  """Bookkeeping record for one open list."""

  def __init__(self, tag):
    # Number of items seen in the list so far.
    self.item_count = 0
    # The tag that started the list.
    self.tag = tag
class FragmentTree:
  """Class for managing a tree of fragments.

  There is a "scope" formed by nested fragments, e.g.
  italic fragment inside bold fragment inside paragraph.
  The scope is stored in the stack. For convenience,
  the stack always has at least one element (the topmost fragment).

  Fragments popped out from the scope may be re-added
  back into the tree as children of the last fragment.
  This allows "chaining" of structured content for future
  processing. For example, if there were several bold
  fragments inside a paragraph interleaved with fragments
  of regular text, all these fragments will end up as
  children of the paragraph fragment.
  """

  def __init__(self, top_fragment):
    """Creates a tree whose scope holds only `top_fragment`."""
    self._stack = [top_fragment]

  def ActiveFragmentScopeDepth(self):
    """Returns the number of fragments in scope, not counting the topmost."""
    return len(self._stack) - 1

  def StartFragment(self, fragment):
    """Makes `fragment` the innermost scope element and returns it.

    Its parent is set to the previous innermost fragment; it is not added to
    that fragment's children.
    """
    fragment.SetParent(self._stack[-1])
    self._stack.append(fragment)
    return fragment

  def EndFragment(self):
    """Removes the innermost fragment from the scope and returns it."""
    return self._stack.pop()

  def AppendFragment(self, fragment):
    """Adds `fragment` as a child of the innermost fragment; returns it."""
    return self._stack[-1].AddChild(fragment)

  def _ApplyRecursivelyToNode(self, node, scope_operation, operation,
                              debug_indent):
    """Applies `operation` to the descendants of `node`, then to `node`.

    When `debug_indent` is set, every child is printed before it is visited.
    """
    if not debug_indent:
      for child in node.GetChildren():
        self._ApplyRecursivelyToNode(child, scope_operation, operation, None)
    else:
      debug_indent += ' c '
      for child in node.GetChildren():
        print(debug_indent + repr(child))
        self._ApplyRecursivelyToNode(child, scope_operation, operation,
                                     debug_indent)
    operation(node)

  def _ApplyRecursivelyToScope(self, nodes, scope_operation, operation,
                               debug_indent):
    """Walks the scope from the outermost to the innermost fragment.

    `nodes` holds the scope fragments with the outermost one last; it is
    consumed. `scope_operation` runs on the way in, the child trees are
    processed on the way out, innermost scope fragment first.
    """
    node = nodes.pop()
    scope_operation(node)
    if debug_indent:
      print(debug_indent + repr(node))
    if nodes:
      self._ApplyRecursivelyToScope(nodes, scope_operation, operation,
                                    (debug_indent + ' s ' if debug_indent
                                     else None))
    self._ApplyRecursivelyToNode(node, scope_operation, operation,
                                 debug_indent)

  def ApplyToAllFragments(self, scope_operation, operation):
    """Recursively applies operations to all fragments in the tree.

    The omnipresent topmost fragment is excluded. The 'scope_operation'
    is applied to every element in the fragment stack in pre-order.
    The 'operation' is applied to all fragments in the tree in post-order.

    Args:
      scope_operation: The operation to apply to fragments in the scope stack.
      operation: The operation to apply to all fragments in the tree.
    """
    self._ApplyRecursivelyToScope(list(reversed(self._stack[1:])),
                                  scope_operation, operation,
                                  ' ' if FLAGS.debug_print_tree else None)

  def FindFirstFragmentFromEnd(self, predicate, steps_from_last=0):
    """Finds the innermost scope fragment that satisfies `predicate`.

    Args:
      predicate: Callable taking a fragment and returning a truthy value for
        a match.
      steps_from_last: Number of fragments at the end of the stack to leave
        out of the search.

    Returns:
      The matching fragment closest to the end of the stack, or None.
    """
    end = -steps_from_last if steps_from_last else None
    # Walk backwards so that the match nearest to the end of the stack wins,
    # as the method name promises.
    candidates = reversed(self._stack[:end])
    return next((node for node in candidates if predicate(node)), None)

  def PeekFragmentFromStart(self, steps_from_first=0):
    """Returns the fragment `steps_from_first` positions from the topmost."""
    return self._stack[steps_from_first]

  def PeekFragmentFromEnd(self, steps_from_last=0):
    """Returns the fragment `steps_from_last` positions from the innermost."""
    return self._stack[-(steps_from_last + 1)]

  def PeekLastAppendedFragment(self):
    """Returns the newest child of the innermost fragment, or None."""
    return (self._stack[-1].GetChildren()[-1]
            if self._stack[-1].GetChildren() else None)
class MarkdownGenerator:
"""Generates Markdown based on the series of HTML tags seen.
Each time an opening HTML tag is seen, the appropriate markdown fragment is
created and pushed onto a stack. Any text encountered is appended to the
fragment at the top of the stack. When a closing HTML tag is seen, the stack
is popped and the fragment removed is appended to the new top of the stack.
Markdown is buffered in the fragment stack until an entire line has been
formed, at which point _WriteFragmentsAsLine() is called to write it out. The
content buffered in the stack is cleared, but otherwise the stack remains
unmodified.
"""
def __init__(self, out, url_translator):
# `out` receives the generated Markdown through write(). `url_translator`
# must offer Translate(url), which is applied to link and image targets.
self._out = out
self._url_translator = url_translator
# The fragment stack starts out with a plain Text fragment at the bottom.
self._fragment_tree = FragmentTree(Text())
# One ListInfo per list that is currently open, innermost last.
self._list_info_stack = []
# Newlines owed to the output; they are written right before the next line
# that has content.
self._pending_newlines = 0
# Initialize the regexps to match nothing (rather than be None).
self._code_class_regex = re.compile(FLAGS.code_class_regex or 'a^')
self._toc_class_regex = re.compile(FLAGS.toc_class_regex or 'a^')
self._ignore_class_regex = re.compile(FLAGS.ignore_class_regex or 'a^')
self._ignore_style_regex = re.compile(FLAGS.ignore_style_regex or 'a^')
def _Push(self, fragment):
"""Sets the parent fragment and pushes it onto the fragment stack.
In the case where there is an IgnoreBlock on the stack, a new IgnoreBlock
is pushed instead.
Args:
fragment: The Fragment object to push on the stack.
"""
if isinstance(self._fragment_tree.PeekFragmentFromEnd(), IgnoreBlock):
# If the top of the stack is IgnoreBlock, push an IgnoreBlock instead.
fragment = IgnoreBlock()
else:
# Check if we need to merge adjacent formatting, e.g.
# instead of **bold****bold** we need to write **boldbold**,
# as the former is not correct Markdown syntax.
last_appended = self._fragment_tree.PeekLastAppendedFragment()
if last_appended and last_appended.NeedsToMergeWith(fragment):
last_appended.UnsetSuffix()
fragment.UnsetPrefix()
self._fragment_tree.StartFragment(fragment)
def _Pop(self):
"""Pops the top fragment and, if possible, appends it to the new top.
When the active scope depth is greater than 1, the popped fragment is
appended as a child of the new top of stack. Otherwise the buffered content
is written to the output first and the fragment is then discarded.
"""
if self._fragment_tree.ActiveFragmentScopeDepth() > 1:
fragment = self._fragment_tree.EndFragment()
self._fragment_tree.AppendFragment(fragment)
else:
self._WriteFragmentsAsLine(newlines=0)
self._fragment_tree.EndFragment()
def _IsWithinFragmentType(self, fragment_type, steps_from_last=0):
# True if some fragment on the stack is an instance of `fragment_type`. The
# last `steps_from_last` stack entries are left out of the search.
return self._fragment_tree.FindFirstFragmentFromEnd(
lambda fragment: isinstance(fragment, fragment_type),
steps_from_last) is not None
def _LastFragmentIs(self, fragment_type, cls):
# True if the top of the stack is a `fragment_type` whose `cls` attribute
# equals `cls`. The attribute is read only if the isinstance check passes.
fragment = self._fragment_tree.PeekFragmentFromEnd()
return (isinstance(fragment, fragment_type) and fragment.cls == cls)
def Break(self):
# Outside a fenced code block a line break flushes the current line. Inside
# one it is buffered as an encoded newline instead of being written out.
if not self._IsWithinFragmentType(FencedCodeBlock):
self._WriteFragmentsAsLine(newlines=1)
else:
fragment = FencedCodeBlockLine(prefix='', suffix='')
self._Push(fragment)
fragment.Append(ENCODED_NEWLINE)
self._Pop()
def HorizontalRule(self):
# Horizontal rule must be preceded and followed by a blank line
self._AddVerticallyPaddedParagraph('---')
def StartDocument(self):
# The whole document lives inside one wrapped text block.
self._Push(WrappedTextBlock())
def EndDocument(self):
self._Pop()
def StartParagraph(self):
# Paragraphs push no fragment; they only flush with a blank line around.
self._WriteFragmentsAsLine(newlines=2)
def EndParagraph(self):
self._WriteFragmentsAsLine(newlines=2)
def StartDiv(self, cls, style):
"""Process opening of a div element.
Args:
cls: The class attribute of the element.
style: The style attribute of the element.
"""
# Nothing is flushed while inside a fenced code block. Inside any other code
# block one newline is requested, everywhere else a blank line.
if not self._IsWithinFragmentType(FencedCodeBlock):
if self._IsWithinFragmentType(CodeBlock):
self._WriteFragmentsAsLine(newlines=1)
else:
self._WriteFragmentsAsLine(newlines=2)
# The first matching rule below decides which fragment represents the div.
if ((cls and self._ignore_class_regex.match(cls)) or
style and self._ignore_style_regex.match(style)):
self._Push(IgnoreBlock())
elif self._IsWithinFragmentType(FencedCodeBlock):
self._Push(FencedCodeBlockLine())
elif self._IsWithinFragmentType(CodeBlock):
self._Push(CodeBlock())
elif self._IsWithinFragmentType(BlockquoteBlock):
self._Push(BlockquoteBlock(indent=None))
elif cls and self._toc_class_regex.match(cls):
self._AddTableOfContents()
self._Push(IgnoreBlock()) # Ignore the items inside the Sites TOC
elif cls and self._code_class_regex.match(cls):
if FLAGS.indented_code_blocks:
self._Push(IndentedCodeBlock())
else:
self._Push(FencedCodeBlock())
else:
self._Push(WrappedTextBlock())
def EndDiv(self):
# Same flushing rules as StartDiv(), except that the top-of-stack fragment
# is left out of the checks (steps_from_last=1).
if not self._IsWithinFragmentType(FencedCodeBlock, steps_from_last=1):
if self._IsWithinFragmentType(CodeBlock, steps_from_last=1):
self._WriteFragmentsAsLine(newlines=1)
else:
self._WriteFragmentsAsLine(newlines=2)
self._Pop()
def StartHeader(self, level):
# h1/h2 become underlined headers when FLAGS.underline_headers is set; all
# other headers get a prefix of `level` '#' characters and a space.
self._WriteFragmentsAsLine(newlines=2)
if level == 1 and FLAGS.underline_headers:
self._Push(UnderlinedHeader('='))
elif level == 2 and FLAGS.underline_headers:
self._Push(UnderlinedHeader('-'))
else:
self._Push(TextBlock(prefix=('#' * level) + ' '))
def EndHeader(self):
self._WriteFragmentsAsLine(newlines=2)
self._Pop()
def StartList(self, tag):
# An outermost list is preceded by a blank line, a nested list by a single
# newline.
if not self._list_info_stack:
self._WriteFragmentsAsLine(newlines=2)
else:
self._WriteFragmentsAsLine(newlines=1)
self._list_info_stack.append(ListInfo(tag))
# The list itself is a Text fragment that contributes only indentation.
if tag == 'ol':
self._Push(Text(' ' * FLAGS.ordered_list_indent))
else:
self._Push(Text(' ' * FLAGS.unordered_list_indent))
def EndList(self):
self._list_info_stack.pop()
if not self._list_info_stack:
self._WriteFragmentsAsLine(newlines=2)
else:
self._WriteFragmentsAsLine(newlines=1)
self._Pop()
def StartListItem(self):
self._WriteFragmentsAsLine(newlines=1)
# Google Sites sometimes spits out pages with <li> tags not enclosed within
# an <ol> or <ul> tag.
tag = ''
if self._list_info_stack:
self._list_info_stack[-1].item_count += 1
tag = self._list_info_stack[-1].tag
if tag == 'ol':
item_count = self._list_info_stack[-1].item_count
# string.ljust makes room for as many digits as you need.
prefix = ('%d.' % item_count).ljust(FLAGS.ordered_list_indent)
self._Push(ListItem(prefix))
else:
prefix = '*'.ljust(FLAGS.unordered_list_indent)
self._Push(ListItem(prefix))
def EndListItem(self):
self._WriteFragmentsAsLine(newlines=1)
self._Pop()
def StartFormat(self, tag):
# Allowed formatting depends on the surrounding fragment type.
if not self._IsWithinFragmentType(IndentedCodeBlock):
formats_map = {
'i': ItalicFormattedText,
'em': ItalicFormattedText,
'b': BoldFormattedText,
'strong': BoldFormattedText,
'strike': StrikeThroughFormattedText,
's': StrikeThroughFormattedText,
'del': StrikeThroughFormattedText,
'u': HighlightFormattedText,
'code': Code,
None: Text,
}
if self._IsWithinFragmentType(FencedCodeBlock):
if FLAGS.allow_html_code_blocks:
# HTML code block can render formats but must not use Code fragments.
formats_map['code'] = formats_map[None] = CodeBlock
else:
formats_map = {None: CodeBlock}
else:
# Inside an indented code block no formatting is allowed.
formats_map = {None: CodeBlock}
# Tags missing from the map fall back to the entry stored under None.
self._Push(formats_map[tag]() if tag in formats_map
else formats_map[None]())
def EndFormat(self):
self._Pop()
def StartAnchor(self, href):
# An anchor without href is pushed as plain Text so that EndAnchor() still
# has a fragment to pop.
if href is not None:
href = self._url_translator.Translate(href)
self._Push(Link(href))
else:
self._Push(Text())
def EndAnchor(self):
self._Pop()
def StartBlockquote(self):
# Inside a code block a blockquote is pushed as plain Text.
if not self._IsWithinFragmentType(CodeBlock):
self._WriteFragmentsAsLine(newlines=1)
self._Push(BlockquoteBlock())
else:
self._Push(Text())
def EndBlockquote(self):
if not self._IsWithinFragmentType(CodeBlock):
self._WriteFragmentsAsLine(newlines=2)
self._Pop()
def Image(self, src, alt, width, height):
# Images are appended as children of the current fragment; nothing is
# pushed onto the stack.
src = self._url_translator.Translate(src)
self._fragment_tree.AppendFragment(Image(src, alt, width, height))
def Iframe(self, src, width, height):
"""Process an <iframe> element.
Sites use <iframe> for embedded content: Docs, Drawings, etc.
g3doc implements this by supporting <iframe> HTML tag directly.
Args:
src: Source URL.
width: Element width.
height: Element height.
"""
# NOTE(review): presumably all three statements below belong to the
# `if False:` branch, i.e. iframes are currently skipped entirely -- confirm
# against a copy of the file with its indentation intact.
if False:
# TODO(dpranke): Figure out if we should support embedded IFRAME tags.
# For now, we skip over them.
self._WriteFragmentsAsLine(newlines=2)
self._Push(EmbeddedContent(src, width, height))
self._Pop()
def StartTable(self, cls):
# Only tables whose class contains 'sites-layout-hbox' but not
# 'sites-layout-name-one-column' are converted; they become a two-column
# container <div>.
if (cls and 'sites-layout-hbox' in cls and
'sites-layout-name-one-column' not in cls):
self._AddHTML('<div class="two-column-container">', newlines=1)
self._Push(Div('two-column-container'))
# NOTE(review): in EndTable/StartTD/EndTD the nesting of the trailing
# `_Pop()` calls and of the `else:` branch cannot be read off this copy of
# the source -- confirm it against the original indentation before editing.
def EndTable(self):
if self._LastFragmentIs(Div, cls='two-column-container'):
self._AddHTML('</div>', newlines=1)
self._Pop()
def StartTD(self, cls):
if self._LastFragmentIs(Div, cls='two-column-container'):
if cls and 'sites-tile-name-content-1' in cls:
self._AddHTML('<div class="two-column-left">', newlines=1)
self._Push(Div('two-column-left'))
elif cls and 'sites-tile-name-content-2' in cls:
self._AddHTML('<div class="two-column-right">', newlines=1)
self._Push(Div('two-column-right'))
else:
self._Push(Text())
def EndTD(self):
if (self._LastFragmentIs(Div, 'two-column-left') or
self._LastFragmentIs(Div, 'two-column-right')):
self._AddHTML('</div>', newlines=1)
self._Pop()
def Text(self, text):
# Text is appended as a new child fragment: a CodeBlock inside code blocks,
# plain Text otherwise. Nothing is created while the top of the stack is an
# IgnoreBlock.
if not isinstance(self._fragment_tree.PeekFragmentFromEnd(), IgnoreBlock):
fragment = (CodeBlock() if self._IsWithinFragmentType(CodeBlock)
else Text())
self._fragment_tree.AppendFragment(fragment)
fragment.Append(text)
def _AddTableOfContents(self):
# TOC must be preceded and followed by a blank line
self._AddVerticallyPaddedParagraph('[TOC]')
def _AddVerticallyPaddedParagraph(self, text):
# Writes `text` unescaped as its own paragraph, with a blank line requested
# before and after it.
self._WriteFragmentsAsLine(newlines=2)
fragment = CodeBlock() # Use CodeBlock to prevent escaping
self._fragment_tree.AppendFragment(fragment)
fragment.Append(text)
self._WriteFragmentsAsLine(newlines=2)
def _AddHTML(self, html, newlines):
"""Writes out a block of html followed by |newlines-1| blank lines."""
fragment = HTML()
fragment.Append(html)
self._fragment_tree.AppendFragment(fragment)
self._WriteFragmentsAsLine(newlines)
def _WriteFragmentsAsLine(self, newlines):
"""Writes out any content currently buffered in the fragment stack.
Args:
newlines: The minimum number of newlines required in the output after this
line. These newlines won't be written out until the next line with
content is encountered.
"""
# Generate indent and the content, then clear content in fragments.
indent = io.StringIO()
self._fragment_tree.ApplyToAllFragments(
lambda fragment: fragment.WriteIndent(indent),
lambda fragment: fragment.WriteContentIntoParentAndClear())
# All content has now been collected in the bottom fragment of the stack;
# stripping and wrapping are delegated to the fragment at the top.
last_fragment = self._fragment_tree.PeekFragmentFromEnd()
content = self._fragment_tree.PeekFragmentFromStart().ConsumeContent()
content = last_fragment.StripLine(content.getvalue())
indent = indent.getvalue()
content = last_fragment.WrapLine(content, indent)
# Write the content, if any.
if content:
self._out.write('\n' * self._pending_newlines)
self._out.write(indent)
self._out.write(content)
self._pending_newlines = newlines
elif self._pending_newlines > 0 and self._pending_newlines < newlines:
# Nothing to write: only raise a newline count that is already pending.
self._pending_newlines = newlines
if FLAGS.debug_print_tree:
# Separate trees printed during each writing session
print('-' * 20)
class XhtmlHandler(xml.sax.ContentHandler):
  """SAX content handler that forwards events to a MarkdownGenerator."""

  # Matches an HTML header tag name and captures its level.
  _HEADER_TAG_RE = re.compile(r'h([1-6])$')

  def __init__(self, out, url_translator):
    xml.sax.ContentHandler.__init__(self)
    self._generator = MarkdownGenerator(out, url_translator)

  def startDocument(self):
    self._generator.StartDocument()

  def endDocument(self):
    self._generator.EndDocument()

  def startElementNS(self, name, qname, attrs):
    """Dispatches an opening tag; tags that are not handled are ignored.

    Args:
      name: Tuple whose second element is the local tag name.
      qname: Unused.
      attrs: Attribute mapping keyed by (namespace, name) tuples.
    """
    tag = name[1]
    gen = self._generator

    def attr(key):
      # Attributes are looked up without a namespace.
      return attrs.get((None, key))

    if tag in ('b', 'code', 'em', 'i', 'strong', 's', 'strike', 'del', 'u'):
      gen.StartFormat(tag)
    elif tag in ('ul', 'ol'):
      gen.StartList(tag)
    elif tag == 'a':
      gen.StartAnchor(attr('href'))
    elif tag == 'br':
      gen.Break()
    elif tag == 'hr':
      gen.HorizontalRule()
    elif tag == 'li':
      gen.StartListItem()
    elif tag == 'p':
      gen.StartParagraph()
    elif tag == 'blockquote':
      gen.StartBlockquote()
    elif tag == 'div':
      gen.StartDiv(attr('class'), attr('style'))
    elif tag == 'img':
      gen.Image(attr('src'), attr('alt'), attr('width'), attr('height'))
    elif tag == 'iframe':
      gen.Iframe(attr('src'), attr('width'), attr('height'))
    elif tag == 'table':
      gen.StartTable(attr('class'))
    elif tag == 'td':
      gen.StartTD(attr('class'))
    else:
      header = self._HEADER_TAG_RE.match(tag)
      if header:
        gen.StartHeader(int(header.group(1)))

  def endElementNS(self, name, qname):
    """Dispatches a closing tag; tags that are not handled are ignored.

    Args:
      name: Tuple whose second element is the local tag name.
      qname: Unused.
    """
    tag = name[1]
    # Generator method to call for each closing tag.
    closers = {
        'a': 'EndAnchor',
        'li': 'EndListItem',
        'div': 'EndDiv',
        'p': 'EndParagraph',
        'ul': 'EndList',
        'ol': 'EndList',
        'blockquote': 'EndBlockquote',
        'td': 'EndTD',
        'table': 'EndTable',
    }
    for format_tag in ('b', 'code', 'em', 'i', 'strong', 's', 'strike',
                       'del', 'u'):
      closers[format_tag] = 'EndFormat'
    if tag in closers:
      getattr(self._generator, closers[tag])()
    elif self._HEADER_TAG_RE.match(tag):
      self._generator.EndHeader()

  def characters(self, content):
    self._generator.Text(content)
class DefaultUrlTranslator:
  """UrlTranslator that leaves every URL unchanged."""

  def Translate(self, href):
    """Returns `href` exactly as it was passed in."""
    translated = href
    return translated
def Convert(input_stream, output_stream, url_translator=DefaultUrlTranslator()):
  """Reads XHTML from one stream and writes it as Markdown to another.

  The input is parsed with a namespace-aware SAX parser whose events are
  handled by an XhtmlHandler.

  Args:
    input_stream: filehandle for the XHTML input.
    output_stream: filehandle for the Markdown output.
    url_translator: Callback for translating URLs embedded in the page.
  """
  handler = XhtmlHandler(output_stream, url_translator)
  sax_parser = xml.sax.make_parser()
  sax_parser.setFeature(xml.sax.handler.feature_namespaces, 1)
  sax_parser.setContentHandler(handler)
  sax_parser.parse(input_stream)