| """ |
| Python-Markdown Markdown in HTML Extension |
| =============================== |
| |
| An implementation of [PHP Markdown Extra](http://michelf.com/projects/php-markdown/extra/)'s |
| parsing of Markdown syntax in raw HTML. |
| |
| See <https://Python-Markdown.github.io/extensions/raw_html> |
| for documentation. |
| |
| Copyright The Python Markdown Project |
| |
| License: [BSD](https://opensource.org/licenses/bsd-license.php) |
| |
| """ |
| |
| from . import Extension |
| from ..blockprocessors import BlockProcessor |
| from ..preprocessors import Preprocessor |
| from ..postprocessors import RawHtmlPostprocessor |
| from .. import util |
| from ..htmlparser import HTMLExtractor, blank_line_re |
| import xml.etree.ElementTree as etree |
| |
| |
| class HTMLExtractorExtra(HTMLExtractor): |
| """ |
| Override HTMLExtractor and create etree Elements for any elements which should have content parsed as Markdown. |
| """ |
| |
| def __init__(self, md, *args, **kwargs): |
| # All block-level tags. |
| self.block_level_tags = set(md.block_level_elements.copy()) |
| # Block-level tags in which the content only gets span level parsing |
| self.span_tags = set( |
| ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th'] |
| ) |
| # Block-level tags which never get their content parsed. |
| self.raw_tags = set(['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea']) |
| # Block-level tags in which the content gets parsed as blocks |
| super().__init__(md, *args, **kwargs) |
| |
| self.block_tags = set(self.block_level_tags) - (self.span_tags | self.raw_tags | self.empty_tags) |
| self.span_and_blocks_tags = self.block_tags | self.span_tags |
| |
| def reset(self): |
| """Reset this instance. Loses all unprocessed data.""" |
| self.mdstack = [] # When markdown=1, stack contains a list of tags |
| self.treebuilder = etree.TreeBuilder() |
| self.mdstate = [] # one of 'block', 'span', 'off', or None |
| super().reset() |
| |
| def close(self): |
| """Handle any buffered data.""" |
| super().close() |
| # Handle any unclosed tags. |
| if self.mdstack: |
| # Close the outermost parent. handle_endtag will close all unclosed children. |
| self.handle_endtag(self.mdstack[0]) |
| |
| def get_element(self): |
| """ Return element from treebuilder and reset treebuilder for later use. """ |
| element = self.treebuilder.close() |
| self.treebuilder = etree.TreeBuilder() |
| return element |
| |
| def get_state(self, tag, attrs): |
| """ Return state from tag and `markdown` attr. One of 'block', 'span', or 'off'. """ |
| md_attr = attrs.get('markdown', '0') |
| if md_attr == 'markdown': |
| # `<tag markdown>` is the same as `<tag markdown='1'>`. |
| md_attr = '1' |
| parent_state = self.mdstate[-1] if self.mdstate else None |
| if parent_state == 'off' or (parent_state == 'span' and md_attr != '0'): |
| # Only use the parent state if it is more restrictive than the markdown attribute. |
| md_attr = parent_state |
| if ((md_attr == '1' and tag in self.block_tags) or |
| (md_attr == 'block' and tag in self.span_and_blocks_tags)): |
| return 'block' |
| elif ((md_attr == '1' and tag in self.span_tags) or |
| (md_attr == 'span' and tag in self.span_and_blocks_tags)): |
| return 'span' |
| elif tag in self.block_level_tags: |
| return 'off' |
| else: # pragma: no cover |
| return None |
| |
| def handle_starttag(self, tag, attrs): |
| # Handle tags that should always be empty and do not specify a closing tag |
| if tag in self.empty_tags and (self.at_line_start() or self.intail): |
| attrs = {key: value if value is not None else key for key, value in attrs} |
| if "markdown" in attrs: |
| attrs.pop('markdown') |
| element = etree.Element(tag, attrs) |
| data = etree.tostring(element, encoding='unicode', method='html') |
| else: |
| data = self.get_starttag_text() |
| self.handle_empty_tag(data, True) |
| return |
| |
| if tag in self.block_level_tags and (self.at_line_start() or self.intail): |
| # Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`. |
| # Convert to `{'checked': 'checked'}`. |
| attrs = {key: value if value is not None else key for key, value in attrs} |
| state = self.get_state(tag, attrs) |
| if self.inraw or (state in [None, 'off'] and not self.mdstack): |
| # fall back to default behavior |
| attrs.pop('markdown', None) |
| super().handle_starttag(tag, attrs) |
| else: |
| if 'p' in self.mdstack and tag in self.block_level_tags: |
| # Close unclosed 'p' tag |
| self.handle_endtag('p') |
| self.mdstate.append(state) |
| self.mdstack.append(tag) |
| attrs['markdown'] = state |
| self.treebuilder.start(tag, attrs) |
| else: |
| # Span level tag |
| if self.inraw: |
| super().handle_starttag(tag, attrs) |
| else: |
| text = self.get_starttag_text() |
| if self.mdstate and self.mdstate[-1] == "off": |
| self.handle_data(self.md.htmlStash.store(text)) |
| else: |
| self.handle_data(text) |
| if tag in self.CDATA_CONTENT_ELEMENTS: |
| # This is presumably a standalone tag in a code span (see #1036). |
| self.clear_cdata_mode() |
| |
| def handle_endtag(self, tag): |
| if tag in self.block_level_tags: |
| if self.inraw: |
| super().handle_endtag(tag) |
| elif tag in self.mdstack: |
| # Close element and any unclosed children |
| while self.mdstack: |
| item = self.mdstack.pop() |
| self.mdstate.pop() |
| self.treebuilder.end(item) |
| if item == tag: |
| break |
| if not self.mdstack: |
| # Last item in stack is closed. Stash it |
| element = self.get_element() |
| # Get last entry to see if it ends in newlines |
| # If it is an element, assume there is no newlines |
| item = self.cleandoc[-1] if self.cleandoc else '' |
| # If we only have one newline before block element, add another |
| if not item.endswith('\n\n') and item.endswith('\n'): |
| self.cleandoc.append('\n') |
| self.cleandoc.append(self.md.htmlStash.store(element)) |
| self.cleandoc.append('\n\n') |
| self.state = [] |
| # Check if element has a tail |
| if not blank_line_re.match( |
| self.rawdata[self.line_offset + self.offset + len(self.get_endtag_text(tag)):]): |
| # More content exists after endtag. |
| self.intail = True |
| else: |
| # Treat orphan closing tag as a span level tag. |
| text = self.get_endtag_text(tag) |
| if self.mdstate and self.mdstate[-1] == "off": |
| self.handle_data(self.md.htmlStash.store(text)) |
| else: |
| self.handle_data(text) |
| else: |
| # Span level tag |
| if self.inraw: |
| super().handle_endtag(tag) |
| else: |
| text = self.get_endtag_text(tag) |
| if self.mdstate and self.mdstate[-1] == "off": |
| self.handle_data(self.md.htmlStash.store(text)) |
| else: |
| self.handle_data(text) |
| |
| def handle_startendtag(self, tag, attrs): |
| if tag in self.empty_tags: |
| attrs = {key: value if value is not None else key for key, value in attrs} |
| if "markdown" in attrs: |
| attrs.pop('markdown') |
| element = etree.Element(tag, attrs) |
| data = etree.tostring(element, encoding='unicode', method='html') |
| else: |
| data = self.get_starttag_text() |
| else: |
| data = self.get_starttag_text() |
| self.handle_empty_tag(data, is_block=self.md.is_block_level(tag)) |
| |
| def handle_data(self, data): |
| if self.intail and '\n' in data: |
| self.intail = False |
| if self.inraw or not self.mdstack: |
| super().handle_data(data) |
| else: |
| self.treebuilder.data(data) |
| |
| def handle_empty_tag(self, data, is_block): |
| if self.inraw or not self.mdstack: |
| super().handle_empty_tag(data, is_block) |
| else: |
| if self.at_line_start() and is_block: |
| self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n') |
| else: |
| self.handle_data(self.md.htmlStash.store(data)) |
| |
| def parse_pi(self, i): |
| if self.at_line_start() or self.intail or self.mdstack: |
| # The same override exists in HTMLExtractor without the check |
| # for mdstack. Therefore, use HTMLExtractor's parent instead. |
| return super(HTMLExtractor, self).parse_pi(i) |
| # This is not the beginning of a raw block so treat as plain data |
| # and avoid consuming any tags which may follow (see #1066). |
| self.handle_data('<?') |
| return i + 2 |
| |
| def parse_html_declaration(self, i): |
| if self.at_line_start() or self.intail or self.mdstack: |
| # The same override exists in HTMLExtractor without the check |
| # for mdstack. Therefore, use HTMLExtractor's parent instead. |
| return super(HTMLExtractor, self).parse_html_declaration(i) |
| # This is not the beginning of a raw block so treat as plain data |
| # and avoid consuming any tags which may follow (see #1066). |
| self.handle_data('<!') |
| return i + 2 |
| |
| |
| class HtmlBlockPreprocessor(Preprocessor): |
| """Remove html blocks from the text and store them for later retrieval.""" |
| |
| def run(self, lines): |
| source = '\n'.join(lines) |
| parser = HTMLExtractorExtra(self.md) |
| parser.feed(source) |
| parser.close() |
| return ''.join(parser.cleandoc).split('\n') |
| |
| |
| class MarkdownInHtmlProcessor(BlockProcessor): |
| """Process Markdown Inside HTML Blocks which have been stored in the HtmlStash.""" |
| |
| def test(self, parent, block): |
| # ALways return True. `run` will return `False` it not a valid match. |
| return True |
| |
| def parse_element_content(self, element): |
| """ |
| Resursively parse the text content of an etree Element as Markdown. |
| |
| Any block level elements generated from the Markdown will be inserted as children of the element in place |
| of the text content. All `markdown` attributes are removed. For any elements in which Markdown parsing has |
| been dissabled, the text content of it and its chidlren are wrapped in an `AtomicString`. |
| """ |
| |
| md_attr = element.attrib.pop('markdown', 'off') |
| |
| if md_attr == 'block': |
| # Parse content as block level |
| # The order in which the different parts are parsed (text, children, tails) is important here as the |
| # order of elements needs to be preserved. We can't be inserting items at a later point in the current |
| # iteration as we don't want to do raw processing on elements created from parsing Markdown text (for |
| # example). Therefore, the order of operations is children, tails, text. |
| |
| # Recursively parse existing children from raw HTML |
| for child in list(element): |
| self.parse_element_content(child) |
| |
| # Parse Markdown text in tail of children. Do this seperate to avoid raw HTML parsing. |
| # Save the position of each item to be inserted later in reverse. |
| tails = [] |
| for pos, child in enumerate(element): |
| if child.tail: |
| block = child.tail.rstrip('\n') |
| child.tail = '' |
| # Use a dummy placeholder element. |
| dummy = etree.Element('div') |
| self.parser.parseBlocks(dummy, block.split('\n\n')) |
| children = list(dummy) |
| children.reverse() |
| tails.append((pos + 1, children)) |
| |
| # Insert the elements created from the tails in reverse. |
| tails.reverse() |
| for pos, tail in tails: |
| for item in tail: |
| element.insert(pos, item) |
| |
| # Parse Markdown text content. Do this last to avoid raw HTML parsing. |
| if element.text: |
| block = element.text.rstrip('\n') |
| element.text = '' |
| # Use a dummy placeholder element as the content needs to get inserted before existing children. |
| dummy = etree.Element('div') |
| self.parser.parseBlocks(dummy, block.split('\n\n')) |
| children = list(dummy) |
| children.reverse() |
| for child in children: |
| element.insert(0, child) |
| |
| elif md_attr == 'span': |
| # Span level parsing will be handled by inlineprocessors. |
| # Walk children here to remove any `markdown` attributes. |
| for child in list(element): |
| self.parse_element_content(child) |
| |
| else: |
| # Disable inline parsing for everything else |
| if element.text is None: |
| element.text = '' |
| element.text = util.AtomicString(element.text) |
| for child in list(element): |
| self.parse_element_content(child) |
| if child.tail: |
| child.tail = util.AtomicString(child.tail) |
| |
| def run(self, parent, blocks): |
| m = util.HTML_PLACEHOLDER_RE.match(blocks[0]) |
| if m: |
| index = int(m.group(1)) |
| element = self.parser.md.htmlStash.rawHtmlBlocks[index] |
| if isinstance(element, etree.Element): |
| # We have a matched element. Process it. |
| blocks.pop(0) |
| self.parse_element_content(element) |
| parent.append(element) |
| # Cleanup stash. Replace element with empty string to avoid confusing postprocessor. |
| self.parser.md.htmlStash.rawHtmlBlocks.pop(index) |
| self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '') |
| # Comfirm the match to the blockparser. |
| return True |
| # No match found. |
| return False |
| |
| |
| class MarkdownInHTMLPostprocessor(RawHtmlPostprocessor): |
| def stash_to_string(self, text): |
| """ Override default to handle any etree elements still in the stash. """ |
| if isinstance(text, etree.Element): |
| return self.md.serializer(text) |
| else: |
| return str(text) |
| |
| |
| class MarkdownInHtmlExtension(Extension): |
| """Add Markdown parsing in HTML to Markdown class.""" |
| |
| def extendMarkdown(self, md): |
| """ Register extension instances. """ |
| |
| # Replace raw HTML preprocessor |
| md.preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20) |
| # Add blockprocessor which handles the placeholders for etree elements |
| md.parser.blockprocessors.register( |
| MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105 |
| ) |
| # Replace raw HTML postprocessor |
| md.postprocessors.register(MarkdownInHTMLPostprocessor(md), 'raw_html', 30) |
| |
| |
| def makeExtension(**kwargs): # pragma: no cover |
| return MarkdownInHtmlExtension(**kwargs) |