markdown/htmlparser.py - chromium/src/third_party/Python-Markdown - Git at Google

 """
 Python Markdown

 A Python implementation of John Gruber's Markdown.

 Documentation: https://python-markdown.github.io/
 GitHub: https://github.com/Python-Markdown/markdown/
 PyPI: https://pypi.org/project/Markdown/

 Started by Manfred Stienstra (http://www.dwerg.net/).
 Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
 Currently maintained by Waylan Limberg (https://github.com/waylan),
 Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

 Copyright 2007-2020 The Python Markdown Project (v. 1.7 and later)
 Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
 Copyright 2004 Manfred Stienstra (the original version)

 License: BSD (see LICENSE.md for details).
 """

 import re
 import importlib.util
 import sys


 # Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
 # Users can still do `from html import parser` and get the default behavior.
 spec = importlib.util.find_spec('html.parser')
 htmlparser = importlib.util.module_from_spec(spec)
 spec.loader.exec_module(htmlparser)
 sys.modules['htmlparser'] = htmlparser

 # Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions.
 htmlparser.piclose = re.compile(r'\?>')
 # Monkeypatch HTMLParser to only recognize entity references with a closing semicolon.
 htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
 # Monkeypatch HTMLParser to no longer support partial entities. We are always feeding a complete block,
 # so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete,
 # and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
 htmlparser.incomplete = htmlparser.entityref
 # Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
 htmlparser.locatestarttagend_tolerant = re.compile(r"""
   <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
   (?:[\s/]*                           # optional whitespace before attribute name
     (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
       (?:\s*=+\s*                     # value indicator
         (?:'[^']*'                    # LITA-enclosed value
           |"[^"]*"                    # LIT-enclosed value
           |(?!['"])[^`>\s]*           # bare value <= added backtick here
          )
          (?:\s*,)*                    # possibly followed by a comma
        )?(?:\s|/(?!>))*
      )*
    )?
   \s*                                 # trailing whitespace
 """, re.VERBOSE)

 # Match a blank line at the start of a block of text (two newlines).
 # The newlines may be preceded by additional whitespace.
 blank_line_re = re.compile(r'^([ ]*\n){2}')


 class HTMLExtractor(htmlparser.HTMLParser):
     """
     Extract raw HTML from text.

     The raw HTML is stored in the `htmlStash` of the Markdown instance passed
     to `md` and the remaining text is stored in `cleandoc` as a list of strings.
     """

     def __init__(self, md, *args, **kwargs):
         if 'convert_charrefs' not in kwargs:
             kwargs['convert_charrefs'] = False

         # Block tags that should contain no content (self closing)
         self.empty_tags = set(['hr'])

         # This calls self.reset
         super().__init__(*args, **kwargs)
         self.md = md

     def reset(self):
         """Reset this instance.  Loses all unprocessed data."""
         self.inraw = False
         self.intail = False
         self.stack = []  # When inraw==True, stack contains a list of tags
         self._cache = []
         self.cleandoc = []
         super().reset()

     def close(self):
         """Handle any buffered data."""
         super().close()
         if len(self.rawdata):
             # Temp fix for https://bugs.python.org/issue41989
             # TODO: remove this when the bug is fixed in all supported Python versions.
             if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                 self.handle_data(htmlparser.unescape(self.rawdata))
             else:
                 self.handle_data(self.rawdata)
         # Handle any unclosed tags.
         if len(self._cache):
             self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
             self._cache = []

     @property
     def line_offset(self):
         """Returns char index in self.rawdata for the start of the current line. """
         if self.lineno > 1 and '\n' in self.rawdata:
             m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata)
             if m:
                 return m.end()
             else:  # pragma: no cover
                 # Value of self.lineno must exceed total number of lines.
                 # Find index of beginning of last line.
                 return self.rawdata.rfind('\n')
         return 0

     def at_line_start(self):
         """
         Returns True if current position is at start of line.

         Allows for up to three blank spaces at start of line.
         """
         if self.offset == 0:
             return True
         if self.offset > 3:
             return False
         # Confirm up to first 3 chars are whitespace
         return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

     def get_endtag_text(self, tag):
         """
         Returns the text of the end tag.

         If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
         """
         # Attempt to extract actual tag from raw source text
         start = self.line_offset + self.offset
         m = htmlparser.endendtag.search(self.rawdata, start)
         if m:
             return self.rawdata[start:m.end()]
         else:  # pragma: no cover
             # Failed to extract from raw data. Assume well formed and lowercase.
             return '</{}>'.format(tag)

     def handle_starttag(self, tag, attrs):
         # Handle tags that should always be empty and do not specify a closing tag
         if tag in self.empty_tags:
             self.handle_startendtag(tag, attrs)
             return

         if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
             # Started a new raw block. Prepare stack.
             self.inraw = True
             self.cleandoc.append('\n')

         text = self.get_starttag_text()
         if self.inraw:
             self.stack.append(tag)
             self._cache.append(text)
         else:
             self.cleandoc.append(text)
             if tag in self.CDATA_CONTENT_ELEMENTS:
                 # This is presumably a standalone tag in a code span (see #1036).
                 self.clear_cdata_mode()

     def handle_endtag(self, tag):
         text = self.get_endtag_text(tag)

         if self.inraw:
             self._cache.append(text)
             if tag in self.stack:
                 # Remove tag from stack
                 while self.stack:
                     if self.stack.pop() == tag:
                         break
             if len(self.stack) == 0:
                 # End of raw block.
                 if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                     # Preserve blank line and end of raw block.
                     self._cache.append('\n')
                 else:
                     # More content exists after endtag.
                     self.intail = True
                 # Reset stack.
                 self.inraw = False
                 self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                 # Insert blank line between this and next line.
                 self.cleandoc.append('\n\n')
                 self._cache = []
         else:
             self.cleandoc.append(text)

     def handle_data(self, data):
         if self.intail and '\n' in data:
             self.intail = False
         if self.inraw:
             self._cache.append(data)
         else:
             self.cleandoc.append(data)

     def handle_empty_tag(self, data, is_block):
         """ Handle empty tags (`<data>`). """
         if self.inraw or self.intail:
             # Append this to the existing raw block
             self._cache.append(data)
         elif self.at_line_start() and is_block:
             # Handle this as a standalone raw block
             if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                 # Preserve blank line after tag in raw block.
                 data += '\n'
             else:
                 # More content exists after tag.
                 self.intail = True
             item = self.cleandoc[-1] if self.cleandoc else ''
             # If we only have one newline before block element, add another
             if not item.endswith('\n\n') and item.endswith('\n'):
                 self.cleandoc.append('\n')
             self.cleandoc.append(self.md.htmlStash.store(data))
             # Insert blank line between this and next line.
             self.cleandoc.append('\n\n')
         else:
             self.cleandoc.append(data)

     def handle_startendtag(self, tag, attrs):
         self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

     def handle_charref(self, name):
         self.handle_empty_tag('&#{};'.format(name), is_block=False)

     def handle_entityref(self, name):
         self.handle_empty_tag('&{};'.format(name), is_block=False)

     def handle_comment(self, data):
         self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)

     def handle_decl(self, data):
         self.handle_empty_tag('<!{}>'.format(data), is_block=True)

     def handle_pi(self, data):
         self.handle_empty_tag('<?{}?>'.format(data), is_block=True)

     def unknown_decl(self, data):
         end = ']]>' if data.startswith('CDATA[') else ']>'
         self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)

     def parse_pi(self, i):
         if self.at_line_start() or self.intail:
             return super().parse_pi(i)
         # This is not the beginning of a raw block so treat as plain data
         # and avoid consuming any tags which may follow (see #1066).
         self.handle_data('<?')
         return i + 2

     def parse_html_declaration(self, i):
         if self.at_line_start() or self.intail:
             return super().parse_html_declaration(i)
         # This is not the beginning of a raw block so treat as plain data
         # and avoid consuming any tags which may follow (see #1066).
         self.handle_data('<!')
         return i + 2

     # The rest has been copied from base class in standard lib to address #1036.
     # As __startag_text is private, all references to it must be in this subclass.
     # The last few lines of parse_starttag are reversed so that handle_starttag
     # can override cdata_mode in certain situations (in a code span).
     __starttag_text = None

     def get_starttag_text(self):
         """Return full source of start tag: '<...>'."""
         return self.__starttag_text

     def parse_starttag(self, i):  # pragma: no cover
         self.__starttag_text = None
         endpos = self.check_for_whole_start_tag(i)
         if endpos < 0:
             return endpos
         rawdata = self.rawdata
         self.__starttag_text = rawdata[i:endpos]

         # Now parse the data between i+1 and j into a tag and attrs
         attrs = []
         match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
         assert match, 'unexpected call to parse_starttag()'
         k = match.end()
         self.lasttag = tag = match.group(1).lower()
         while k < endpos:
             m = htmlparser.attrfind_tolerant.match(rawdata, k)
             if not m:
                 break
             attrname, rest, attrvalue = m.group(1, 2, 3)
             if not rest:
                 attrvalue = None
             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                  attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                 attrvalue = attrvalue[1:-1]
             if attrvalue:
                 attrvalue = htmlparser.unescape(attrvalue)
             attrs.append((attrname.lower(), attrvalue))
             k = m.end()

         end = rawdata[k:endpos].strip()
         if end not in (">", "/>"):
             lineno, offset = self.getpos()
             if "\n" in self.__starttag_text:
                 lineno = lineno + self.__starttag_text.count("\n")
                 offset = len(self.__starttag_text) \
                          - self.__starttag_text.rfind("\n")  # noqa: E127
             else:
                 offset = offset + len(self.__starttag_text)
             self.handle_data(rawdata[i:endpos])
             return endpos
         if end.endswith('/>'):
             # XHTML-style empty tag: <span attr="value" />
             self.handle_startendtag(tag, attrs)
         else:
             # *** set cdata_mode first so we can override it in handle_starttag (see #1036) ***
             if tag in self.CDATA_CONTENT_ELEMENTS:
                 self.set_cdata_mode(tag)
             self.handle_starttag(tag, attrs)
         return endpos
	"""
	Python Markdown

	A Python implementation of John Gruber's Markdown.

	Documentation: https://python-markdown.github.io/
	GitHub: https://github.com/Python-Markdown/markdown/
	PyPI: https://pypi.org/project/Markdown/

	Started by Manfred Stienstra (http://www.dwerg.net/).
	Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
	Currently maintained by Waylan Limberg (https://github.com/waylan),
	Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

	Copyright 2007-2020 The Python Markdown Project (v. 1.7 and later)
	Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
	Copyright 2004 Manfred Stienstra (the original version)

	License: BSD (see LICENSE.md for details).
	"""

	import re
	import importlib.util
	import sys


	# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
	# Users can still do `from html import parser` and get the default behavior.
	spec = importlib.util.find_spec('html.parser')
	htmlparser = importlib.util.module_from_spec(spec)
	spec.loader.exec_module(htmlparser)
	sys.modules['htmlparser'] = htmlparser

	# Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions.
	htmlparser.piclose = re.compile(r'\?>')
	# Monkeypatch HTMLParser to only recognize entity references with a closing semicolon.
	htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
	# Monkeypatch HTMLParser to no longer support partial entities. We are always feeding a complete block,
	# so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete,
	# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
	htmlparser.incomplete = htmlparser.entityref
	# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
	htmlparser.locatestarttagend_tolerant = re.compile(r"""
	<[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
	(?:[\s/]* # optional whitespace before attribute name
	(?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
	(?:\s=+\s # value indicator
	(?:'[^']*' # LITA-enclosed value
	\|"[^"]*" # LIT-enclosed value
	\|(?!['"])[^`>\s]* # bare value <= added backtick here
	)
	(?:\s,) # possibly followed by a comma
	)?(?:\s\|/(?!>))*
	)*
	)?
	\s* # trailing whitespace
	""", re.VERBOSE)

	# Match a blank line at the start of a block of text (two newlines).
	# The newlines may be preceded by additional whitespace.
	blank_line_re = re.compile(r'^([ ]*\n){2}')


	class HTMLExtractor(htmlparser.HTMLParser):
	"""
	Extract raw HTML from text.

	The raw HTML is stored in the `htmlStash` of the Markdown instance passed
	to `md` and the remaining text is stored in `cleandoc` as a list of strings.
	"""

	def __init__(self, md, args, *kwargs):
	if 'convert_charrefs' not in kwargs:
	kwargs['convert_charrefs'] = False

	# Block tags that should contain no content (self closing)
	self.empty_tags = set(['hr'])

	# This calls self.reset
	super().__init__(args, *kwargs)
	self.md = md

	def reset(self):
	"""Reset this instance. Loses all unprocessed data."""
	self.inraw = False
	self.intail = False
	self.stack = [] # When inraw==True, stack contains a list of tags
	self._cache = []
	self.cleandoc = []
	super().reset()

	def close(self):
	"""Handle any buffered data."""
	super().close()
	if len(self.rawdata):
	# Temp fix for https://bugs.python.org/issue41989
	# TODO: remove this when the bug is fixed in all supported Python versions.
	if self.convert_charrefs and not self.cdata_elem: # pragma: no cover
	self.handle_data(htmlparser.unescape(self.rawdata))
	else:
	self.handle_data(self.rawdata)
	# Handle any unclosed tags.
	if len(self._cache):
	self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
	self._cache = []

	@property
	def line_offset(self):
	"""Returns char index in self.rawdata for the start of the current line. """
	if self.lineno > 1 and '\n' in self.rawdata:
	m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata)
	if m:
	return m.end()
	else: # pragma: no cover
	# Value of self.lineno must exceed total number of lines.
	# Find index of beginning of last line.
	return self.rawdata.rfind('\n')
	return 0

	def at_line_start(self):
	"""
	Returns True if current position is at start of line.

	Allows for up to three blank spaces at start of line.
	"""
	if self.offset == 0:
	return True
	if self.offset > 3:
	return False
	# Confirm up to first 3 chars are whitespace
	return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

	def get_endtag_text(self, tag):
	"""
	Returns the text of the end tag.

	If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
	"""
	# Attempt to extract actual tag from raw source text
	start = self.line_offset + self.offset
	m = htmlparser.endendtag.search(self.rawdata, start)
	if m:
	return self.rawdata[start:m.end()]
	else: # pragma: no cover
	# Failed to extract from raw data. Assume well formed and lowercase.
	return '</{}>'.format(tag)

	def handle_starttag(self, tag, attrs):
	# Handle tags that should always be empty and do not specify a closing tag
	if tag in self.empty_tags:
	self.handle_startendtag(tag, attrs)
	return

	if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
	# Started a new raw block. Prepare stack.
	self.inraw = True
	self.cleandoc.append('\n')

	text = self.get_starttag_text()
	if self.inraw:
	self.stack.append(tag)
	self._cache.append(text)
	else:
	self.cleandoc.append(text)
	if tag in self.CDATA_CONTENT_ELEMENTS:
	# This is presumably a standalone tag in a code span (see #1036).
	self.clear_cdata_mode()

	def handle_endtag(self, tag):
	text = self.get_endtag_text(tag)

	if self.inraw:
	self._cache.append(text)
	if tag in self.stack:
	# Remove tag from stack
	while self.stack:
	if self.stack.pop() == tag:
	break
	if len(self.stack) == 0:
	# End of raw block.
	if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
	# Preserve blank line and end of raw block.
	self._cache.append('\n')
	else:
	# More content exists after endtag.
	self.intail = True
	# Reset stack.
	self.inraw = False
	self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
	# Insert blank line between this and next line.
	self.cleandoc.append('\n\n')
	self._cache = []
	else:
	self.cleandoc.append(text)

	def handle_data(self, data):
	if self.intail and '\n' in data:
	self.intail = False
	if self.inraw:
	self._cache.append(data)
	else:
	self.cleandoc.append(data)

	def handle_empty_tag(self, data, is_block):
	""" Handle empty tags (`<data>`). """
	if self.inraw or self.intail:
	# Append this to the existing raw block
	self._cache.append(data)
	elif self.at_line_start() and is_block:
	# Handle this as a standalone raw block
	if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
	# Preserve blank line after tag in raw block.
	data += '\n'
	else:
	# More content exists after tag.
	self.intail = True
	item = self.cleandoc[-1] if self.cleandoc else ''
	# If we only have one newline before block element, add another
	if not item.endswith('\n\n') and item.endswith('\n'):
	self.cleandoc.append('\n')
	self.cleandoc.append(self.md.htmlStash.store(data))
	# Insert blank line between this and next line.
	self.cleandoc.append('\n\n')
	else:
	self.cleandoc.append(data)

	def handle_startendtag(self, tag, attrs):
	self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

	def handle_charref(self, name):
	self.handle_empty_tag('&#{};'.format(name), is_block=False)

	def handle_entityref(self, name):
	self.handle_empty_tag('&{};'.format(name), is_block=False)

	def handle_comment(self, data):
	self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)

	def handle_decl(self, data):
	self.handle_empty_tag('<!{}>'.format(data), is_block=True)

	def handle_pi(self, data):
	self.handle_empty_tag('<?{}?>'.format(data), is_block=True)

	def unknown_decl(self, data):
	end = ']]>' if data.startswith('CDATA[') else ']>'
	self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)

	def parse_pi(self, i):
	if self.at_line_start() or self.intail:
	return super().parse_pi(i)
	# This is not the beginning of a raw block so treat as plain data
	# and avoid consuming any tags which may follow (see #1066).
	self.handle_data('<?')
	return i + 2

	def parse_html_declaration(self, i):
	if self.at_line_start() or self.intail:
	return super().parse_html_declaration(i)
	# This is not the beginning of a raw block so treat as plain data
	# and avoid consuming any tags which may follow (see #1066).
	self.handle_data('<!')
	return i + 2

	# The rest has been copied from base class in standard lib to address #1036.
	# As __startag_text is private, all references to it must be in this subclass.
	# The last few lines of parse_starttag are reversed so that handle_starttag
	# can override cdata_mode in certain situations (in a code span).
	__starttag_text = None

	def get_starttag_text(self):
	"""Return full source of start tag: '<...>'."""
	return self.__starttag_text

	def parse_starttag(self, i): # pragma: no cover
	self.__starttag_text = None
	endpos = self.check_for_whole_start_tag(i)
	if endpos < 0:
	return endpos
	rawdata = self.rawdata
	self.__starttag_text = rawdata[i:endpos]

	# Now parse the data between i+1 and j into a tag and attrs
	attrs = []
	match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
	assert match, 'unexpected call to parse_starttag()'
	k = match.end()
	self.lasttag = tag = match.group(1).lower()
	while k < endpos:
	m = htmlparser.attrfind_tolerant.match(rawdata, k)
	if not m:
	break
	attrname, rest, attrvalue = m.group(1, 2, 3)
	if not rest:
	attrvalue = None
	elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
	attrvalue[:1] == '"' == attrvalue[-1:]: # noqa: E127
	attrvalue = attrvalue[1:-1]
	if attrvalue:
	attrvalue = htmlparser.unescape(attrvalue)
	attrs.append((attrname.lower(), attrvalue))
	k = m.end()

	end = rawdata[k:endpos].strip()
	if end not in (">", "/>"):
	lineno, offset = self.getpos()
	if "\n" in self.__starttag_text:
	lineno = lineno + self.__starttag_text.count("\n")
	offset = len(self.__starttag_text) \
	- self.__starttag_text.rfind("\n") # noqa: E127
	else:
	offset = offset + len(self.__starttag_text)
	self.handle_data(rawdata[i:endpos])
	return endpos
	if end.endswith('/>'):
	# XHTML-style empty tag: <span attr="value" />
	self.handle_startendtag(tag, attrs)
	else:
	# * set cdata_mode first so we can override it in handle_starttag (see #1036) *
	if tag in self.CDATA_CONTENT_ELEMENTS:
	self.set_cdata_mode(tag)
	self.handle_starttag(tag, attrs)
	return endpos