| # Copyright 2021 Google LLC |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # https://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """HTML to Markdown renderer.""" |
| |
import io
import os
import re
import textwrap
import urllib
import urllib.parse
import xml.sax
| |
| |
class _Flags:
    """Rendering options for the HTML to Markdown renderer.

    The options are read by the rest of the module through the module-level
    FLAGS instance.
    """

    # Render h1 and h2 headers underlined with '=' and '-' when True.
    underline_headers = False

    # Characters that get a backslash in front of them in the Markdown
    # output. This is deliberately not every special Markdown character,
    # only the ones most often misread as Markdown syntax; escaping
    # everything produces ugly Markdown.
    escape_chars = r'\`*[]'

    # Marker placed around italic text.
    italic_format = '*'

    # Marker placed around bold text.
    bold_format = '**'

    # Marker placed around strikethrough text.
    strike_format = '~~'

    # Marker placed around underlined text.
    highlight_format = '=='

    # Width, in columns, of the indent of an unordered list. The bullet is
    # counted in this width.
    unordered_list_indent = 4

    # Width, in columns, of the indent of an ordered list. The item number
    # is counted in this width.
    ordered_list_indent = 4

    # Class of the DIV blocks that are rendered as code.
    code_class_regex = r'^sites-codeblock sites-codesnippet-block$'

    # Class of the DIV blocks that hold a table of contents.
    toc_class_regex = r'^sites-embed-content sites-embed-type-toc$'

    # Class of the DIV blocks that are dropped from the output.
    ignore_class_regex = r''

    # Style of the DIV blocks that are dropped from the output.
    ignore_style_regex = r'^display:none;$'

    # Column limit for wrapped text blocks; zero turns wrapping off.
    line_width = 80

    # Use indented code blocks when True, fenced code blocks when False.
    indented_code_blocks = False

    # Emit an HTML code block instead of a fenced one when the source code
    # block contains formatted text.
    allow_html_code_blocks = True

    # Links the renderer recognizes automatically.
    shortlinks_regex = r'^http://(ag|b|cl|g|go|who)/'

    # Print the fragment tree while rendering, for debugging.
    debug_print_tree = False


FLAGS = _Flags()
| |
| |
def _EscapeText(text, reserved_chars):
    """Puts a backslash in front of every reserved character.

    Args:
      text: The string to escape.
      reserved_chars: A string holding the characters that must be escaped.

    Returns:
      A copy of text in which each reserved character is preceded by '\\'.
    """
    return ''.join(
        '\\' + char if char in reserved_chars else char for char in text)
| |
| |
def _EscapeContentForHtml(text):
    """Escapes angle brackets so that text can be embedded in HTML.

    Args:
      text: The string to escape.

    Returns:
      A new io.StringIO holding the escaped text. The stream position is left
      at the end, so the buffer can be used directly as a fragment's content
      and later writes are appended to it.
    """
    # The replacements must be the HTML entities; mapping each bracket to
    # itself would leave the text unescaped.
    escapes = {'<': '&lt;', '>': '&gt;'}
    result = io.StringIO()
    result.write(''.join(escapes.get(c, c) for c in text))
    return result
| |
| |
# Stand-in for a newline: the numeric character reference of '\n'. It is used
# where a real newline would be removed by line stripping.
ENCODED_NEWLINE = '&#{:d};'.format(ord('\n'))


def _RestoreEncodedNewlines(text):
    """Returns text with every ENCODED_NEWLINE turned back into a newline."""
    return '\n'.join(text.split(ENCODED_NEWLINE))
| |
| |
def _WrapLine(line, indent):
    """Wraps the line to fit into the column limit.

    Args:
      line: The string to wrap.
      indent: An integer with the number of columns of indentation.

    Returns:
      The wrapped text; continuation lines start with `indent` spaces. The
      line is returned unchanged when wrapping is disabled
      (FLAGS.line_width <= 0) or when the indent leaves no room for text.
    """
    if FLAGS.line_width <= 0:
        return line
    available = FLAGS.line_width - indent
    if available <= 0:
        # textwrap.wrap() raises ValueError for a non-positive width, so an
        # indent that uses up the whole line leaves the text unwrapped.
        return line
    continuation = '\n' + ' ' * indent
    return continuation.join(textwrap.wrap(
        line,
        width=available,
        break_long_words=False,
        break_on_hyphens=False))
| |
| |
class Fragment:
    """Base class for all output fragments.

    To generate a line of output, the methods will be called in the following
    order:

        WriteIndent()
        WriteContentIntoParentAndClear()
        ConsumeContent() -- for the topmost fragment only
        StripLine()
        WrapLine()
    """

    def __init__(self, indent, prefix, suffix):
        """Creates an empty fragment.

        Args:
          indent: String written by WriteIndent(), or None for no indent.
          prefix: String written before non-blank content, or None.
          suffix: String written after non-blank content, or None.
        """
        self._content = io.StringIO()
        self._indent = indent
        self._prefix = prefix
        self._suffix = suffix
        self._parent = None
        self._children = []

    def __repr__(self):
        # All parts are kept as str: encoding them to bytes would make the
        # concatenation below raise TypeError on Python 3.
        def debug_print(text):
            return text if text else ''

        return ('{' +
                self.__class__.__name__ +
                ': indent=' + debug_print(self._indent) +
                '; prefix=' + debug_print(self._prefix) +
                '; content=' + debug_print(self._content.getvalue()) +
                '; suffix=' + debug_print(self._suffix) +
                '}')

    def SetParent(self, parent):
        """Records the fragment that encloses this one."""
        self._parent = parent

    def AddChild(self, node):
        """Appends node to the children, makes self its parent, returns node."""
        self._children.append(node)
        node.SetParent(self)
        return node

    def GetChildren(self):
        """Returns the list of direct children."""
        return self._children

    def _AllChildren(self):
        """Returns all descendants in depth-first, pre-order."""
        all_children = []

        def Traverse(fragment):
            for c in fragment.GetChildren():
                all_children.append(c)
                Traverse(c)

        Traverse(self)
        return all_children

    def WriteIndent(self, output):
        """Writes this fragment's indent, if it has one, to output."""
        if self._indent:
            output.write(self._indent)

    def WriteContentIntoParentAndClear(self):
        """Renders the content into the parent's buffer, then resets itself.

        The fragment must have a parent. Afterwards the content buffer is
        empty and the list of children is dropped.
        """
        self._WriteContent(self._parent._content)  # pylint: disable=protected-access
        self._ClearContent()
        self._children = []

    def _WriteContent(self, output):
        """Implementation of content rendering. Can be overridden in subclasses."""
        self._Write(output, self._prefix, self._content.getvalue(), self._suffix)

    def _Write(self, output, prefix, content, suffix):
        """Default implementation of content rendering for reuse by subclasses.

        The content is always written; prefix and suffix are written only
        when the content is not blank.
        """
        has_content = bool(content.strip())
        if prefix and has_content:
            output.write(prefix)
        output.write(content)
        if suffix and has_content:
            output.write(suffix)

    def UnsetSuffix(self):
        """Drops the suffix."""
        self._suffix = ''

    def UnsetPrefix(self):
        """Drops the prefix."""
        self._prefix = ''

    def _UpdatePrefixAndSuffix(self, prefix, suffix):
        """Replaces prefix and suffix, each only if it is currently set.

        A prefix or suffix that is empty or None stays that way.
        """
        if self._prefix:
            self._prefix = prefix
        if self._suffix:
            self._suffix = suffix

    def _ClearContent(self):
        """Clears the content. This will only be called after it's been written."""
        self._content = io.StringIO()

    def ConsumeContent(self):
        """Returns the content buffer (an io.StringIO) and starts a new one."""
        content = self._content
        self._ClearContent()
        return content

    def Append(self, text):
        """Appends text.

        Args:
          text: The string to append, it will be escaped.
        """
        assert isinstance(text, str)
        self._content.write(self.EscapeText(text))

    def EscapeText(self, text):
        """Escapes any reserved characters when Append() is called with text.

        By default this defers to the parent fragment.

        Args:
          text: The string to escape.

        Returns:
          The escaped string.
        """
        if self._parent:
            return self._parent.EscapeText(text)
        return text

    def StripLine(self, text):
        """Does any needed stripping of whitespace.

        Some blocks (code for example) will want to preserve whitespace, while
        others will want to coalesce it together. By default this defers to the
        parent fragment.

        Args:
          text: The string to strip

        Returns:
          The stripped string.
        """
        if self._parent:
            return self._parent.StripLine(text)
        return text

    def WrapLine(self, line, indent):
        """Wraps the line to fit into the column limit, if necessary.

        Most blocks (code for example) will want to preserve whitespace and won't
        break their output.

        Args:
          line: The string to wrap.
          indent: Indent string.

        Returns:
          The wrapped string. This base implementation returns line unchanged.
        """
        del indent
        return line

    def NeedsToMergeWith(self, text):
        """Tells whether the formatting of this fragment merges with `text`.

        Args:
          text: The fragment that is about to follow this one.

        Returns:
          Always False in the base class.
        """
        del text
        return False
| |
| |
class Text(Fragment):
    """Plain string fragment; indent, prefix and suffix are all optional."""

    def __init__(self, indent=None, prefix=None, suffix=None):
        super().__init__(indent=indent, prefix=prefix, suffix=suffix)
| |
| |
class IgnoreBlock(Fragment):
    """Markdown fragment that omits all content."""

    def __init__(self):
        super().__init__(None, None, None)

    def _WriteContent(self, output):
        """Writes nothing.

        Child fragments render into this block's buffer, and the inherited
        implementation would pass that text on to the parent. Overriding it
        keeps everything below an ignored block out of the output.

        Args:
          output: The parent's buffer; left untouched.
        """
        del output
| |
| |
class TextBlock(Text):
    """Text fragment that escapes reserved characters and coalesces spaces."""

    def EscapeText(self, text):
        """Escapes the characters listed in FLAGS.escape_chars."""
        return _EscapeText(text, FLAGS.escape_chars)

    def StripLine(self, text):
        """Returns text as one line with runs of spaces collapsed.

        Newlines and Unicode non-breaking spaces (code point 160) are turned
        into plain spaces first. Replacing them is safer than deleting them,
        because spaces are coalesced afterwards anyway.
        """
        flattened = text.replace('\n', ' ').replace(chr(160), ' ')
        return re.sub(r' +', ' ', flattened.strip())
| |
| |
class WrappedTextBlock(TextBlock):
    """TextBlock whose output lines are wrapped to the column limit."""

    def WrapLine(self, line, indent):
        """Wraps line, using the length of the indent string as indentation."""
        indent_columns = len(indent)
        return _WrapLine(line, indent_columns)
| |
| |
class BlockquoteBlock(WrappedTextBlock):
    """Wrapped text in which every output line starts with the quote indent.

    Paragraphs inside a blockquote must be emitted by the generator as
    BlockquoteBlocks without an indent. Their WrapLine() call is then handed
    up to the outermost BlockquoteBlock, which wraps the text and puts the
    indent in front of each continuation line.
    """

    def __init__(self, indent='> '):
        super().__init__(indent, None, None)

    def WrapLine(self, line, indent):
        """Wraps line and joins the pieces with the indent string.

        A block that has no indent of its own but does have a parent passes
        the call on to that parent.
        """
        has_no_indent = not self._indent
        if has_no_indent and self._parent:
            return self._parent.WrapLine(line, indent)
        pieces = _WrapLine(line, len(indent)).splitlines(True)
        return indent.join(piece.lstrip() for piece in pieces)
| |
| |
class CodeBlock(Text):
    """Common base of the code block fragments; text is never escaped."""

    def EscapeText(self, text):
        """Returns text as is."""
        return text

    def StripLine(self, text):
        """Removes all newlines; Sites always uses <br/> inside code blocks."""
        return ''.join(text.split('\n'))

    def ChangeToHtml(self):
        """Swaps non-empty content for its HTML-escaped form."""
        raw = self._content.getvalue()
        if not raw:
            return
        self._content = _EscapeContentForHtml(raw)
| |
| |
class IndentedCodeBlock(CodeBlock):
    """An IndentedCodeBlock indents by four spaces."""

    def __init__(self, indent='    '):
        """Creates the block.

        Args:
          indent: The indent string. It defaults to four spaces, as the class
            description states; Markdown only treats a line as an indented
            code block when it is indented by at least four spaces.
        """
        super().__init__(indent, None, None)
| |
| |
class FencedCodeBlock(CodeBlock):
    """Code block fenced with triple backticks (```).

    For correct rendering the content must not be written before the end of
    the source code block has been seen: the whole code block from the
    source HTML has to be rendered in a single write pass.
    """

    def __init__(self, indent=None,
                 prefix='```none' + ENCODED_NEWLINE,
                 suffix=ENCODED_NEWLINE + '```'):
        super().__init__(indent, prefix, suffix)

    def WriteIndent(self, output):
        """Switches to HTML markup if needed, then writes the indent.

        When HTML code blocks are allowed and some descendant is formatted
        text, the formatted descendants, the code block descendants and the
        fence of this block are all changed to their HTML form.
        """
        if FLAGS.allow_html_code_blocks:
            formatted = [child for child in self._AllChildren()
                         if isinstance(child, FormattedText)]
            for child in formatted:
                child.ChangeToHtml()
            if formatted:
                for child in self._AllChildren():
                    if isinstance(child, CodeBlock):
                        child.ChangeToHtml()
                self._UpdatePrefixAndSuffix(
                    '<pre><code>', ENCODED_NEWLINE + '</code></pre>')
        super().WriteIndent(output)

    def StripLine(self, text):
        """Restores the encoded newlines and drops empty lines."""
        restored = _RestoreEncodedNewlines(super().StripLine(text))
        return '\n'.join(filter(None, restored.splitlines()))

    def WrapLine(self, line, indent):
        """Puts the indent string in front of every line after the first."""
        return indent.join(line.splitlines(True))
| |
| |
class FencedCodeBlockLine(Text):
    """One line of code inside a FencedCodeBlock."""

    def __init__(self, indent=None,
                 prefix=ENCODED_NEWLINE, suffix=ENCODED_NEWLINE):
        super().__init__(indent, prefix, suffix)

    def StripLine(self, text):
        """Strips like the base class, then restores the encoded newlines."""
        stripped = super().StripLine(text)
        return _RestoreEncodedNewlines(stripped)
| |
| |
class UnderlinedHeader(TextBlock):
    """Section header underlined with a run of a single character."""

    def __init__(self, char):
        super().__init__()
        self._char = char

    def _WriteContent(self, output):
        """Writes the header text followed by an underline of equal length.

        Nothing is written when the stripped header text is empty.
        """
        raw = self._content.getvalue()
        underline_length = len(self.StripLine(raw))
        if underline_length <= 0:
            return
        # A real '\n' would be stripped, so an encoded one is written here
        # and replaced once the line has been stripped.
        underline = ENCODED_NEWLINE + self._char * underline_length
        self._Write(output, None, raw, underline)

    def StripLine(self, text):
        """Strips like a TextBlock, then restores the encoded newlines."""
        stripped = super().StripLine(text)
        return _RestoreEncodedNewlines(stripped)
| |
| |
class FormattedText(Text):
    """Text wrapped in Markdown formatting."""

    def __init__(self, fmt):
        """Creates the fragment.

        Args:
          fmt: The marker string used as both prefix and suffix.
        """
        super().__init__(None, fmt, fmt)

    def _Pad(self, bigger, smaller):
        """Returns as many spaces as `bigger` is longer than `smaller`."""
        return ' ' * (len(bigger) - len(smaller))

    def _WriteContent(self, output):
        """Writes the content with whitespace kept outside of the markers."""
        prefix = self._prefix
        content = self._content.getvalue()
        suffix = self._suffix
        if prefix:
            # If there are whitespaces immediately after the prefix,
            # they must be pushed out before the prefix.
            lstripped = content.lstrip()
            if len(content) > len(lstripped):
                prefix = self._Pad(content, lstripped) + prefix
                content = lstripped
        if suffix:
            # If there are whitespaces immediately before the suffix,
            # they must be pushed out after the suffix.
            rstripped = content.rstrip()
            if len(content) > len(rstripped):
                suffix = suffix + self._Pad(content, rstripped)
                content = rstripped
        self._Write(output, prefix, content, suffix)

    def ChangeToHtml(self):
        """Swaps non-empty content for its HTML-escaped form."""
        content = self._content.getvalue()
        if content:
            # The escaped buffer has to be stored on the fragment; binding
            # it to a local name would leave the content unescaped.
            self._content = _EscapeContentForHtml(content)
| |
| |
class BoldFormattedText(FormattedText):
    """Bold text, marked with FLAGS.bold_format."""

    def __init__(self):
        super().__init__(FLAGS.bold_format)

    def NeedsToMergeWith(self, text):
        """Returns True when the following fragment is bold as well."""
        return isinstance(text, BoldFormattedText)

    def ChangeToHtml(self):
        """Escapes the content and switches the markers to <b> tags."""
        super().ChangeToHtml()
        opening, closing = '<b>', '</b>'
        self._UpdatePrefixAndSuffix(opening, closing)
| |
| |
class ItalicFormattedText(FormattedText):
    """Italic text, marked with FLAGS.italic_format."""

    def __init__(self):
        super().__init__(FLAGS.italic_format)

    def NeedsToMergeWith(self, text):
        """Returns True when the following fragment is italic as well."""
        return isinstance(text, ItalicFormattedText)

    def ChangeToHtml(self):
        """Escapes the content and switches the markers to <i> tags."""
        super().ChangeToHtml()
        opening, closing = '<i>', '</i>'
        self._UpdatePrefixAndSuffix(opening, closing)
| |
| |
class StrikeThroughFormattedText(FormattedText):
    """Struck-through text, marked with FLAGS.strike_format."""

    def __init__(self):
        super().__init__(FLAGS.strike_format)

    def NeedsToMergeWith(self, text):
        """Returns True when the following fragment is struck through too."""
        return isinstance(text, StrikeThroughFormattedText)

    def ChangeToHtml(self):
        """Escapes the content and switches the markers to <s> tags."""
        super().ChangeToHtml()
        opening, closing = '<s>', '</s>'
        self._UpdatePrefixAndSuffix(opening, closing)
| |
| |
class HighlightFormattedText(FormattedText):
    """Highlighted text, marked with FLAGS.highlight_format."""

    def __init__(self):
        super().__init__(FLAGS.highlight_format)

    def NeedsToMergeWith(self, text):
        """Returns True when the following fragment is highlighted as well."""
        return isinstance(text, HighlightFormattedText)

    def ChangeToHtml(self):
        """Escapes the content and switches the markers to <u> tags."""
        super().ChangeToHtml()
        opening, closing = '<u>', '</u>'
        self._UpdatePrefixAndSuffix(opening, closing)
| |
| |
class ListItem(Text):
    """One item of a list; the bullet is written with its first line only."""

    def __init__(self, bullet):
        super().__init__()
        self._bullet = bullet

    def WriteIndent(self, output):
        """Writes the bullet, if it is still pending, and then the indent."""
        bullet = self._bullet
        if bullet:
            # TODO(dpranke): The original code relied on strings and bytes
            # being interchangeable in Python2, which allowed seeking
            # backwards from the current location with a relative offset:
            #
            #   output.seek(-len(self._bullet), os.SEEK_CUR)
            #
            # That apparently cannot be done in Python3. As a stopgap, one
            # '\b' backspace per bullet character is embedded instead of
            # seeking backwards, and the client is expected to remove the
            # backspaces, together with the spaces in front of them, with a
            # global search and replace.
            #
            # This is awkward and should be reworked so that it isn't needed.
            output.write('\b' * len(bullet))
            output.write(bullet)
        super().WriteIndent(output)

    def _ClearContent(self):
        """Clears the content and forgets the bullet."""
        self._bullet = None
        super()._ClearContent()

    def WrapLine(self, line, indent):
        """Wraps line, using the length of the indent string as indentation."""
        return _WrapLine(line, len(indent))
| |
| |
class Link(Text):
    """Markdown link."""

    def __init__(self, href):
        """Creates a link.

        Args:
          href: The link target as a string.
        """
        super().__init__()
        self._href = href
        self._url_opener_prefix = ''
        self._url_opener_suffix = ''

    def MakeAnImage(self, width, height):
        """Turns the link into an image reference.

        Args:
          width: Image width; used only when height is given as well.
          height: Image height; used only when width is given as well.
        """
        self._url_opener_prefix = '!'
        if width and height:
            self._url_opener_suffix = (
                '{{width="{}" height="{}"}}'.format(width, height))

    def _IsShortLink(self, text):
        """Tells whether `text` is the short form of this link's target.

        Args:
          text: The link text.

        Returns:
          True if the target matches FLAGS.shortlinks_regex and its host and
          path together are equal to text, False otherwise.
        """
        pattern = FLAGS.shortlinks_regex
        if not pattern or not re.match(pattern, self._href):
            return False
        parsed_href = urllib.parse.urlsplit(self._href)
        return parsed_href.netloc + parsed_href.path == text

    def _WriteLink(self, output, text):
        """Writes either the bare short link or the full Markdown link."""
        is_image = bool(self._url_opener_prefix or self._url_opener_suffix)
        if not is_image and self._IsShortLink(text):
            self._Write(output, None, text, None)
        else:
            self._Write(output,
                        self._url_opener_prefix + '[',
                        text,
                        '](' + self._href + ')' + self._url_opener_suffix)

    def _WriteContent(self, output):
        """Writes the link; link text that is itself a URL becomes <text>."""
        text = self._content.getvalue()
        if not text:
            return
        if text.startswith(('http://', 'https://')):
            self._Write(output, '<', text, '>')
        else:
            self._WriteLink(output, text)
| |
| |
class Image(Text):
    """Image."""

    def __init__(self, src, alt, width, height):
        """Creates an image fragment.

        Args:
          src: The image URL.
          alt: The alternative text; 'image' is used when it is empty.
          width: Image width, may be empty.
          height: Image height, may be empty.
        """
        super().__init__()
        self._src = src
        self._alt = alt or 'image'
        self._width = width
        self._height = height

    def _WriteContent(self, output):
        """Writes the image as a Markdown image reference."""
        # NOTE(review): the call found here was `self._Write(output, '')`.
        # It lacks two required arguments and raises TypeError whenever an
        # image is rendered. The intended markup is not visible in this
        # file; the Markdown image syntax, with the size attributes written
        # the way Link.MakeAnImage() writes them, is assumed -- confirm.
        size_attrs = ''
        if self._width and self._height:
            size_attrs = '{{width="{}" height="{}"}}'.format(
                self._width, self._height)
        markup = '![{}]({}){}'.format(self._alt, self._src, size_attrs)
        self._Write(output, None, markup, None)
| |
| |
class Code(Text):
    """Inline code span; the text is not escaped."""

    def __init__(self):
        super().__init__(None, '`', '`')

    def EscapeText(self, text):
        """Returns text as is."""
        return text

    def _WriteContent(self, output):
        """Writes the code span, choosing delimiters that fit the content."""
        content = self._content.getvalue()
        if '`' not in content:
            self._Write(output, self._prefix, content, self._suffix)
            return
        # Content that holds a backtick needs double backticks around it. If
        # it starts or ends with a backtick, that would add up to the triple
        # backticks of a fenced code fragment, so it is padded with a space.
        if content.startswith('`'):
            content = ' ' + content
        if content.endswith('`'):
            content += ' '
        self._Write(output, '``', content, '``')

    def NeedsToMergeWith(self, text):
        """Returns True when the following fragment is inline code as well."""
        return isinstance(text, Code)
| |
| |
class EmbeddedContent(Text):
    """Embedded content (Docs, Drawings, Presentations, etc.) as an iframe."""

    def __init__(self, href, width, height):
        super().__init__()
        self._href = href
        self._width = width
        self._height = height

    def _WriteContent(self, output):
        """Writes an <iframe> element for the content.

        An 'http' source URL is rewritten to 'https'. The width and height
        attributes are written only when both values are set.
        """
        parts = urllib.parse.urlsplit(self._href)
        if parts.scheme == 'http':
            parts = parts._replace(scheme='https')
        size_attrs = ''
        if self._width and self._height:
            size_attrs = ' width="{}" height="{}"'.format(
                self._width, self._height)
        # Note: 'allow="fullscreen"' is requested for all content for
        # simplicity. g3doc server has dedicated logic to deal with these
        # requests.
        element = '<iframe src="{}"{} allow="fullscreen" />'.format(
            urllib.parse.urlunsplit(parts), size_attrs)
        self._Write(output, None, element, None)
| |
| |
class ListInfo:
    """Bookkeeping record for one open list."""

    def __init__(self, tag):
        # Number of items seen in the list so far.
        self.item_count = 0
        # The tag that started the list.
        self.tag = tag
| |
| |
class FragmentTree:
    """Class for managing a tree of fragments.

    There is a "scope" formed by nested fragments, e.g.
    italic fragment inside bold fragment inside paragraph.
    The scope is stored in the stack. For convenience,
    the stack always has at least one element.

    Fragments popped out from the scope may be re-added
    back into the tree as children of the last fragment.
    This allows "chaining" of structured content for future
    processing. For example, if there were several bold
    fragments inside a paragraph interleaved with fragments
    of regular text, all these fragments will end up as
    children of the paragraph fragment.
    """

    def __init__(self, top_fragment):
        """Creates a tree whose scope stack holds only top_fragment."""
        self._stack = [top_fragment]

    def ActiveFragmentScopeDepth(self):
        """Returns the number of scope fragments above the topmost one."""
        return len(self._stack) - 1

    def StartFragment(self, fragment):
        """Opens a scope: parents fragment to the current one and pushes it."""
        fragment.SetParent(self._stack[-1])
        self._stack.append(fragment)
        return fragment

    def EndFragment(self):
        """Closes the innermost scope and returns its fragment."""
        return self._stack.pop()

    def AppendFragment(self, fragment):
        """Adds fragment as a child of the innermost scope fragment."""
        return self._stack[-1].AddChild(fragment)

    def _ApplyRecursivelyToNode(self, node, scope_operation, operation,  # pylint: disable=missing-docstring
                                debug_indent):
        # Post-order: all children are handled before the node itself.
        if not debug_indent:
            for child in node.GetChildren():
                self._ApplyRecursivelyToNode(child, scope_operation, operation,
                                             None)
        else:
            debug_indent += ' c '
            for child in node.GetChildren():
                print(debug_indent + repr(child))
                self._ApplyRecursivelyToNode(child, scope_operation, operation,
                                             debug_indent)
        operation(node)

    def _ApplyRecursivelyToScope(self, nodes, scope_operation, operation,  # pylint: disable=missing-docstring
                                 debug_indent):
        # `nodes` holds the scope with the outermost fragment last, so pop()
        # walks it from the outside in. The inner scopes are processed
        # before the subtree of `node` itself.
        node = nodes.pop()
        scope_operation(node)
        if debug_indent:
            print(debug_indent + repr(node))
        if nodes:
            self._ApplyRecursivelyToScope(nodes, scope_operation, operation,
                                          (debug_indent + ' s ' if debug_indent
                                           else None))
        self._ApplyRecursivelyToNode(node, scope_operation, operation,
                                     debug_indent)

    def ApplyToAllFragments(self, scope_operation, operation):
        """Recursively applies operations to all fragments in the tree.

        The omnipresent topmost fragment is excluded. The 'scope_operation'
        is applied to every element in the fragment stack in pre-order.
        The 'operation' is applied to all fragments in the tree in post-order.

        Args:
          scope_operation: The operation to apply to fragments in the scope stack.
          operation: The operation to apply to all fragments in the tree.
        """
        self._ApplyRecursivelyToScope(list(reversed(self._stack[1:])),
                                      scope_operation, operation,
                                      ' ' if FLAGS.debug_print_tree else None)

    def FindFirstFragmentFromEnd(self, predicate, steps_from_last=0):
        """Finds the innermost scope fragment that satisfies predicate.

        Args:
          predicate: A callable that takes a fragment and returns a truth value.
          steps_from_last: The number of innermost scope fragments to leave
            out of the search.

        Returns:
          The first matching fragment, searching from the end of the scope
          stack towards the topmost fragment, or None if nothing matches.
        """
        sub_stack = self._stack[:-steps_from_last if steps_from_last else None]
        # Search from the end, as the name says; iterating forwards would
        # return the outermost match instead.
        return next((node for node in reversed(sub_stack) if predicate(node)),
                    None)

    def PeekFragmentFromStart(self, steps_from_first=0):
        """Returns the scope fragment at the given distance from the topmost."""
        return self._stack[steps_from_first]

    def PeekFragmentFromEnd(self, steps_from_last=0):
        """Returns the scope fragment at the given distance from the innermost."""
        return self._stack[-(steps_from_last + 1)]

    def PeekLastAppendedFragment(self):
        """Returns the newest child of the innermost scope fragment, or None."""
        return (self._stack[-1].GetChildren()[-1]
                if self._stack[-1].GetChildren() else None)
| |
| |
| class MarkdownGenerator: |
| """Generates Markdown based on the series of HTML tags seen. |
| |
| Each time an opening HTML tag is seen, the appropriate markdown fragment is |
| created and pushed onto a stack. Any text encountered is appended to the |
| fragment at the top of the stack. When a closing HTML tag is seen, the stack |
| is popped and the fragment removed is appended to the new top of the stack. |
| |
| Markdown is buffered in the fragment stack until an entire line has been |
| formed, at which point _WriteFragmentsAsLine() is called to write it out. The |
| content buffered in the stack is cleared, but otherwise the stack remains |
| unmodified. |
| """ |
| |
def __init__(self, out, url_translator):
    """Sets up an empty generator.

    Args:
      out: The object the Markdown is written to through write().
      url_translator: The object whose Translate() is applied to link and
        image URLs.
    """
    self._out = out
    self._url_translator = url_translator
    self._fragment_tree = FragmentTree(Text())
    self._list_info_stack = []
    self._pending_newlines = 0
    # An unset flag is compiled into a pattern that matches nothing, so that
    # the attributes are never None.
    match_nothing = 'a^'
    self._code_class_regex = re.compile(
        FLAGS.code_class_regex or match_nothing)
    self._toc_class_regex = re.compile(
        FLAGS.toc_class_regex or match_nothing)
    self._ignore_class_regex = re.compile(
        FLAGS.ignore_class_regex or match_nothing)
    self._ignore_style_regex = re.compile(
        FLAGS.ignore_style_regex or match_nothing)
| |
def _Push(self, fragment):
    """Opens a new scope for the fragment on the fragment stack.

    While an IgnoreBlock is at the top of the stack, a fresh IgnoreBlock is
    pushed in place of the given fragment.

    Args:
      fragment: The Fragment object to push on the stack.
    """
    tree = self._fragment_tree
    if isinstance(tree.PeekFragmentFromEnd(), IgnoreBlock):
        fragment = IgnoreBlock()
    else:
        # Adjacent formatting of the same kind has to be merged: **boldbold**
        # must be written rather than **bold****bold**, which is not correct
        # Markdown syntax.
        previous = tree.PeekLastAppendedFragment()
        if previous and previous.NeedsToMergeWith(fragment):
            previous.UnsetSuffix()
            fragment.UnsetPrefix()

    tree.StartFragment(fragment)
| |
def _Pop(self):
    """Closes the innermost scope of the fragment stack.

    With more than one active scope, the popped fragment is re-attached as a
    child of the new top of the stack. Otherwise the buffered content is
    written to the output before the fragment is popped.
    """
    tree = self._fragment_tree
    if tree.ActiveFragmentScopeDepth() <= 1:
        self._WriteFragmentsAsLine(newlines=0)
        tree.EndFragment()
        return
    closed = tree.EndFragment()
    tree.AppendFragment(closed)
| |
def _IsWithinFragmentType(self, fragment_type, steps_from_last=0):
    """Tells whether the active scope contains a fragment of the given type.

    Args:
      fragment_type: The class (or tuple of classes) to look for.
      steps_from_last: The number of innermost scope fragments to skip.

    Returns:
      True if a matching fragment is on the fragment stack.
    """
    def has_wanted_type(fragment):
        return isinstance(fragment, fragment_type)

    found = self._fragment_tree.FindFirstFragmentFromEnd(
        has_wanted_type, steps_from_last)
    return found is not None
| |
def Break(self):
    """Processes a line break.

    Inside a fenced code block the break is recorded as an encoded newline;
    everywhere else the buffered content is written out as a line.
    """
    if self._IsWithinFragmentType(FencedCodeBlock):
        line_break = FencedCodeBlockLine(prefix='', suffix='')
        self._Push(line_break)
        line_break.Append(ENCODED_NEWLINE)
        self._Pop()
        return
    self._WriteFragmentsAsLine(newlines=1)
| |
def HorizontalRule(self):
    """Emits a horizontal rule with a blank line before and after it."""
    rule = '---'
    self._AddVerticallyPaddedParagraph(rule)
| |
def StartDocument(self):
    """Opens the document by pushing a WrappedTextBlock."""
    document_block = WrappedTextBlock()
    self._Push(document_block)
| |
def EndDocument(self):
    """Closes the document by popping the fragment stack."""
    self._Pop()
| |
def StartParagraph(self):
    """Writes out the buffered content, asking for two newlines after it."""
    self._WriteFragmentsAsLine(newlines=2)
| |
def EndParagraph(self):
    """Writes out the paragraph, asking for two newlines after it."""
    self._WriteFragmentsAsLine(newlines=2)
| |
def StartDiv(self, cls, style):
    """Handles the opening tag of a div element.

    The buffered content is written out first, unless the div is inside a
    fenced code block. Then the fragment matching the div's class, style and
    surroundings is pushed.

    Args:
      cls: The class attribute of the element.
      style: The style attribute of the element.
    """
    # A FencedCodeBlock is a CodeBlock as well, so being inside a fenced
    # block implies being inside a code block.
    inside_fenced = self._IsWithinFragmentType(FencedCodeBlock)
    inside_code = inside_fenced or self._IsWithinFragmentType(CodeBlock)
    if not inside_fenced:
        self._WriteFragmentsAsLine(newlines=1 if inside_code else 2)

    ignored = ((cls and self._ignore_class_regex.match(cls)) or
               (style and self._ignore_style_regex.match(style)))
    if ignored:
        self._Push(IgnoreBlock())
    elif inside_fenced:
        self._Push(FencedCodeBlockLine())
    elif inside_code:
        self._Push(CodeBlock())
    elif self._IsWithinFragmentType(BlockquoteBlock):
        self._Push(BlockquoteBlock(indent=None))
    elif cls and self._toc_class_regex.match(cls):
        self._AddTableOfContents()
        # The items inside the Sites TOC are ignored.
        self._Push(IgnoreBlock())
    elif cls and self._code_class_regex.match(cls):
        code_block = (IndentedCodeBlock() if FLAGS.indented_code_blocks
                      else FencedCodeBlock())
        self._Push(code_block)
    else:
        self._Push(WrappedTextBlock())
| |
def EndDiv(self):
    """Handles the closing tag of a div element.

    The fragment of the div itself is left out of both checks. Unless the
    div sits inside a fenced code block, the buffered content is written
    out, followed by one newline inside a code block and two otherwise.
    """
    if not self._IsWithinFragmentType(FencedCodeBlock, steps_from_last=1):
        inside_code = self._IsWithinFragmentType(CodeBlock, steps_from_last=1)
        self._WriteFragmentsAsLine(newlines=1 if inside_code else 2)
    self._Pop()
| |
def StartHeader(self, level):
    """Handles the opening tag of a header.

    Args:
      level: The header level. Levels 1 and 2 are underlined when
        FLAGS.underline_headers is set; every other header gets `level`
        '#' characters in front of it.
    """
    self._WriteFragmentsAsLine(newlines=2)
    underline_chars = {1: '=', 2: '-'}
    if FLAGS.underline_headers and level in underline_chars:
        header = UnderlinedHeader(underline_chars[level])
    else:
        header = TextBlock(prefix=('#' * level) + ' ')
    self._Push(header)
| |
def EndHeader(self):
    """Writes out the header, asks for two newlines, and closes its scope."""
    self._WriteFragmentsAsLine(newlines=2)
    self._Pop()
| |
def StartList(self, tag):
    """Handles the opening tag of a list.

    Args:
      tag: The tag that starts the list; 'ol' selects the ordered list
        indent, anything else the unordered one.
    """
    # An outermost list is set off by a blank line, a nested one is not.
    is_nested = bool(self._list_info_stack)
    self._WriteFragmentsAsLine(newlines=1 if is_nested else 2)
    self._list_info_stack.append(ListInfo(tag))
    indent_width = (FLAGS.ordered_list_indent if tag == 'ol'
                    else FLAGS.unordered_list_indent)
    self._Push(Text(' ' * indent_width))
| |
def EndList(self):
    """Handles the closing tag of a list."""
    self._list_info_stack.pop()
    # Two newlines follow an outermost list, one follows a nested list.
    still_in_list = bool(self._list_info_stack)
    self._WriteFragmentsAsLine(newlines=1 if still_in_list else 2)
    self._Pop()
| |
def StartListItem(self):
    """Handles the opening tag of a list item.

    Items of an 'ol' list get their running number as the bullet; all other
    items, including those outside of any list, get '*'.
    """
    self._WriteFragmentsAsLine(newlines=1)
    # Google Sites sometimes spits out pages with <li> tags not enclosed
    # within an <ol> or <ul> tag.
    tag = ''
    if self._list_info_stack:
        current_list = self._list_info_stack[-1]
        current_list.item_count += 1
        tag = current_list.tag
    if tag == 'ol':
        number = self._list_info_stack[-1].item_count
        # ljust() only pads, so a number with many digits still fits.
        bullet = ('%d.' % number).ljust(FLAGS.ordered_list_indent)
    else:
        bullet = '*'.ljust(FLAGS.unordered_list_indent)
    self._Push(ListItem(bullet))
| |
def EndListItem(self):
    """Writes out the list item, asks for a newline, and closes its scope."""
    self._WriteFragmentsAsLine(newlines=1)
    self._Pop()
| |
def StartFormat(self, tag):
    """Handles the opening tag of an inline formatting element.

    Args:
      tag: The name of the formatting tag. Tags without an entry in the
        format table are handled by the table's None entry.
    """
    # Which formatting is allowed depends on the surrounding fragment type.
    if self._IsWithinFragmentType(IndentedCodeBlock):
        # Inside an indented code block no formatting is allowed.
        formats_map = {None: CodeBlock}
    else:
        formats_map = {
            'i': ItalicFormattedText,
            'em': ItalicFormattedText,
            'b': BoldFormattedText,
            'strong': BoldFormattedText,
            'strike': StrikeThroughFormattedText,
            's': StrikeThroughFormattedText,
            'del': StrikeThroughFormattedText,
            'u': HighlightFormattedText,
            'code': Code,
            None: Text,
        }
        if self._IsWithinFragmentType(FencedCodeBlock):
            if FLAGS.allow_html_code_blocks:
                # HTML code block can render formats but must not use Code
                # fragments.
                formats_map['code'] = formats_map[None] = CodeBlock
            else:
                formats_map = {None: CodeBlock}
    fragment_class = formats_map.get(tag, formats_map[None])
    self._Push(fragment_class())
| |
def EndFormat(self):
    """Closes the scope of the current formatting element."""
    self._Pop()
| |
def StartAnchor(self, href):
    """Handles the opening tag of an anchor.

    Args:
      href: The link target, or None. An anchor without a target is treated
        as plain text; otherwise the target is translated and a Link is
        pushed.
    """
    if href is None:
        self._Push(Text())
        return
    translated = self._url_translator.Translate(href)
    self._Push(Link(translated))
| |
def EndAnchor(self):
    """Closes the scope of the current anchor."""
    self._Pop()
| |
def StartBlockquote(self):
    """Handles the opening tag of a blockquote.

    Inside a code block the blockquote is treated as plain text.
    """
    if self._IsWithinFragmentType(CodeBlock):
        self._Push(Text())
        return
    self._WriteFragmentsAsLine(newlines=1)
    self._Push(BlockquoteBlock())
| |
def EndBlockquote(self):
    """Handles the closing tag of a blockquote.

    Outside of code blocks the buffered content is written out first,
    asking for two newlines after it.
    """
    inside_code = self._IsWithinFragmentType(CodeBlock)
    if not inside_code:
        self._WriteFragmentsAsLine(newlines=2)
    self._Pop()
| |
def Image(self, src, alt, width, height):
    """Appends an image fragment to the innermost scope.

    Args:
      src: The image URL; it is translated before use.
      alt: The alternative text.
      width: Image width.
      height: Image height.
    """
    translated_src = self._url_translator.Translate(src)
    image = Image(translated_src, alt, width, height)
    self._fragment_tree.AppendFragment(image)
| |
def Iframe(self, src, width, height):
    """Handles an <iframe> element; currently nothing is emitted for it.

    Sites use <iframe> for embedded content: Docs, Drawings, etc.
    g3doc implements this by supporting <iframe> HTML tag directly.

    Args:
      src: Source URL.
      width: Element width.
      height: Element height.
    """
    # TODO(dpranke): Figure out if we should support embedded IFRAME tags.
    # For now, we skip over them.
    embed_iframes = False
    if embed_iframes:
        self._WriteFragmentsAsLine(newlines=2)
        self._Push(EmbeddedContent(src, width, height))
        self._Pop()
| |
def Text(self, text):
    """Appends character data to the fragment tree.

    Nothing is appended while the last fragment in the tree is an
    IgnoreBlock. Otherwise the text goes into a new CodeBlock fragment when
    within a CodeBlock, and into a new Text fragment everywhere else.

    Args:
        text: The character data to append.
    """
    if isinstance(self._fragment_tree.PeekFragmentFromEnd(), IgnoreBlock):
        return

    if self._IsWithinFragmentType(CodeBlock):
        fragment = CodeBlock()
    else:
        fragment = Text()
    self._fragment_tree.AppendFragment(fragment)
    fragment.Append(text)
| |
def _AddTableOfContents(self):
    """Emits the '[TOC]' marker as a vertically padded paragraph.

    The TOC must be preceded and followed by a blank line.
    """
    self._AddVerticallyPaddedParagraph('[TOC]')
| |
def _AddVerticallyPaddedParagraph(self, text):
    """Writes `text` as a paragraph of its own.

    The buffered fragments are written out before and after the text, each
    time requesting at least two newlines.

    Args:
        text: Paragraph content.
    """
    self._WriteFragmentsAsLine(newlines=2)
    # A CodeBlock fragment is used to prevent escaping of the text.
    paragraph = CodeBlock()
    self._fragment_tree.AppendFragment(paragraph)
    paragraph.Append(text)
    self._WriteFragmentsAsLine(newlines=2)
| |
def _WriteFragmentsAsLine(self, newlines):
    """Writes out any content currently buffered in the fragment stack.

    Args:
        newlines: The minimum number of newlines required in the output
            after this line. These newlines won't be written out until the
            next line with content is encountered.
    """
    indent_buffer = io.StringIO()

    def write_indent(fragment):
        return fragment.WriteIndent(indent_buffer)

    def move_content_to_parent(fragment):
        return fragment.WriteContentIntoParentAndClear()

    # Collect the indent and fold all content into the first fragment,
    # clearing the content of the fragments along the way.
    self._fragment_tree.ApplyToAllFragments(
        write_indent, move_content_to_parent)
    tail_fragment = self._fragment_tree.PeekFragmentFromEnd()
    raw_content = self._fragment_tree.PeekFragmentFromStart().ConsumeContent()

    # The last fragment decides how the line is stripped and wrapped.
    line = tail_fragment.StripLine(raw_content.getvalue())
    indent_text = indent_buffer.getvalue()
    line = tail_fragment.WrapLine(line, indent_text)

    if line:
        # Newlines requested by the previous line are emitted only now that
        # there is content to follow them.
        self._out.write('\n' * self._pending_newlines)
        self._out.write(indent_text)
        self._out.write(line)
        self._pending_newlines = newlines
    elif 0 < self._pending_newlines < newlines:
        # Nothing to write: only raise an already pending newline count.
        self._pending_newlines = newlines

    if FLAGS.debug_print_tree:
        # Separate trees printed during each writing session
        print('-' * 20)
| |
| |
class XhtmlHandler(xml.sax.ContentHandler):
    """SAX content handler that forwards XHTML events to a MarkdownGenerator.

    Elements are recognized by their local name only; the namespace part of
    the SAX name is ignored. Elements that are not handled below trigger no
    generator call, although their character data is still forwarded.
    """

    # Matches an HTML header tag (h1 to h6) and captures its level.
    _HEADER_TAG_RE = re.compile(r'h([1-6])$')

    # Inline formatting elements handled by StartFormat / EndFormat.
    _FORMAT_TAGS = (
        'b', 'code', 'em', 'i', 'strong', 's', 'strike', 'del', 'u')

    # List container elements handled by StartList / EndList.
    _LIST_TAGS = ('ul', 'ol')

    def __init__(self, out, url_translator):
        """Creates the handler and the generator it drives.

        Args:
            out: Passed to MarkdownGenerator as its output.
            url_translator: Passed to MarkdownGenerator as its URL
                translator.
        """
        xml.sax.ContentHandler.__init__(self)
        self._generator = MarkdownGenerator(out, url_translator)

    def startDocument(self):
        """Signals the start of the document to the generator."""
        self._generator.StartDocument()

    def endDocument(self):
        """Signals the end of the document to the generator."""
        self._generator.EndDocument()

    def startElementNS(self, name, qname, attrs):
        """Dispatches an opening element to the matching generator call.

        Args:
            name: SAX name tuple; only its second item (the local name) is
                used.
            qname: Qualified name (unused).
            attrs: Attribute collection, queried with (None, local name)
                keys.
        """
        local_name = name[1]
        generator = self._generator

        def attribute(key):
            # Attributes are looked up without a namespace.
            return attrs.get((None, key))

        if local_name == 'a':
            generator.StartAnchor(attribute('href'))
        elif local_name == 'br':
            generator.Break()
        elif local_name == 'hr':
            generator.HorizontalRule()
        elif local_name == 'li':
            generator.StartListItem()
        elif local_name == 'div':
            generator.StartDiv(attribute('class'), attribute('style'))
        elif local_name == 'p':
            generator.StartParagraph()
        elif local_name in self._FORMAT_TAGS:
            generator.StartFormat(local_name)
        elif local_name in self._LIST_TAGS:
            generator.StartList(local_name)
        elif local_name == 'img':
            generator.Image(attribute('src'), attribute('alt'),
                            attribute('width'), attribute('height'))
        elif local_name == 'blockquote':
            generator.StartBlockquote()
        elif local_name == 'iframe':
            generator.Iframe(attribute('src'), attribute('width'),
                             attribute('height'))
        else:
            header = self._HEADER_TAG_RE.match(local_name)
            if header:
                generator.StartHeader(int(header.group(1)))

    def endElementNS(self, name, qname):
        """Dispatches a closing element to the matching generator call.

        Args:
            name: SAX name tuple; only its second item (the local name) is
                used.
            qname: Qualified name (unused).
        """
        local_name = name[1]
        generator = self._generator

        if local_name == 'a':
            generator.EndAnchor()
        elif local_name == 'li':
            generator.EndListItem()
        elif local_name == 'div':
            generator.EndDiv()
        elif local_name == 'p':
            generator.EndParagraph()
        elif local_name in self._FORMAT_TAGS:
            generator.EndFormat()
        elif local_name in self._LIST_TAGS:
            generator.EndList()
        elif local_name == 'blockquote':
            generator.EndBlockquote()
        elif self._HEADER_TAG_RE.match(local_name):
            generator.EndHeader()

    def characters(self, content):
        """Forwards character data to the generator as text."""
        self._generator.Text(content)
| |
| |
class DefaultUrlTranslator:
    """URL translator that leaves every URL untouched."""

    def Translate(self, href):
        """Returns `href` exactly as it was passed in."""
        return href
| |
| |
def Convert(input_stream, output_stream, url_translator=DefaultUrlTranslator()):
    """Converts an input stream of xhtml into an output stream of markdown.

    The input is parsed with a namespace-aware SAX parser whose events are
    handled by an XhtmlHandler.

    Args:
        input_stream: filehandle for the XHTML input.
        output_stream: filehandle for the Markdown output.
        url_translator: Callback for translating URLs embedded in the page.
    """
    sax_parser = xml.sax.make_parser()
    markdown_handler = XhtmlHandler(output_stream, url_translator)
    sax_parser.setContentHandler(markdown_handler)
    # Namespace processing makes the parser report elements through the
    # startElementNS / endElementNS callbacks.
    sax_parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    sax_parser.parse(input_stream)