third_party/Python-Markdown/markdown/preprocessors.py - chromium/src - Git at Google

 """
 PRE-PROCESSORS
 =============================================================================

 Preprocessors work on source text before we start doing anything too
 complicated.
 """

 from __future__ import absolute_import
 from __future__ import unicode_literals
 from . import util
 from . import odict
 import re


 def build_preprocessors(md_instance, **kwargs):
     """
     Assemble the default, ordered collection of preprocessors.

     The whitespace normalizer is registered first and the reference
     preprocessor last. The html block preprocessor sits between them unless
     `md_instance.safeMode` equals 'escape', in which case it is left out.
     Extra keyword arguments are accepted but not used.
     """
     registry = odict.OrderedDict()
     registry['normalize_whitespace'] = NormalizeWhitespace(md_instance)
     skip_html_blocks = md_instance.safeMode == 'escape'
     if not skip_html_blocks:
         registry["html_block"] = HtmlBlockPreprocessor(md_instance)
     registry["reference"] = ReferencePreprocessor(md_instance)
     return registry


 class Preprocessor(util.Processor):
     """
     Base class for preprocessors, which are run after the text is broken
     into lines.

     Each preprocessor implements a "run" method that takes a pointer to a
     list of lines of the document, modifies it as necessary and returns
     either the same pointer or a pointer to a new list.

     Preprocessors must extend markdown.Preprocessor.

     """
     def run(self, lines):
         """
         Transform the document. This base implementation does nothing and
         returns None.

         Each subclass of Preprocessor should override the `run` method, which
         takes the document as a list of strings split by newlines and returns
         the (possibly modified) list of lines.

         """
         pass  # pragma: no cover


 class NormalizeWhitespace(Preprocessor):
     """ Normalize whitespace for consistent parsing. """

     def run(self, lines):
         """
         Return the document as a new list of lines with uniform whitespace.

         The lines are joined with newlines and then, in order: occurrences
         of `util.STX` and `util.ETX` are removed, "\\r\\n" and lone "\\r"
         become "\\n", two newlines are appended, tabs are expanded to
         `self.markdown.tab_length` columns and any run of spaces that sits
         between two newlines is dropped.
         """
         text = '\n'.join(lines)
         for marker in (util.STX, util.ETX):
             text = text.replace(marker, "")
         # Windows line endings must be handled before bare carriage returns.
         text = text.replace("\r\n", "\n")
         text = text.replace("\r", "\n")
         text += "\n\n"
         text = text.expandtabs(self.markdown.tab_length)
         # Lines made up only of spaces become truly empty lines.
         text = re.sub(r'(?<=\n) +\n', '\n', text)
         return text.split('\n')


 class HtmlBlockPreprocessor(Preprocessor):
     """Remove html blocks from the text and store them for later retrieval.

     The document is split on blank lines. A block that starts with ``<`` and
     whose tag is block level (per ``util.isBlockLevel``), a comment, or a
     ``<!``/``<?``/``<@``/``<%`` construct is handed to
     ``self.markdown.htmlStash`` and replaced by the placeholder it returns.
     When ``markdown_in_raw`` is true, an element carrying a ``markdown``
     attribute keeps its content in the text and only its tag is recorded
     with ``htmlStash.store_tag``.
     """

     # Closing forms tried, in order, for a tag name: "</name>" then "name>".
     right_tag_patterns = ["</%s>", "%s>"]
     # A single attribute, written for re.VERBOSE (whitespace and trailing
     # "#" comments inside the pattern are ignored by the regex engine).
     attrs_pattern = r"""
         \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"
         |                                                       # OR
         \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)             # attr=value
         |                                                       # OR
         \s+(?P<attr2>[^>"'/= ]+)                                # attr
         """
     # An opening tag at the start of a block: the name, any number of
     # attributes, then an optional "/" and an optional ">".
     left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
                        attrs_pattern
     attrs_re = re.compile(attrs_pattern, re.VERBOSE)
     left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
     # When true, elements with a "markdown" attribute are recorded with
     # htmlStash.store_tag and their content stays in the text.
     markdown_in_raw = False

     def _get_left_tag(self, block):
         """
         Parse the opening tag at the start of `block`.

         Returns a tuple `(tag, length, attrs)`: the tag name, the number of
         characters the opening tag occupies and a dict of its attributes
         (attributes without a value map to an empty string). If the opening
         tag pattern does not match, everything between the first character
         and the first ">" is lowercased and used as the name, with no
         attributes.
         """
         m = self.left_tag_re.match(block)
         if not m:
             tag = block[1:].split(">", 1)[0].lower()
             return tag, len(tag) + 2, {}
         attrs = {}
         raw_attrs = m.group('attrs')
         if raw_attrs:
             for ma in self.attrs_re.finditer(raw_attrs):
                 if ma.group('attr'):        # attr="value"
                     attrs[ma.group('attr').strip()] = \
                         ma.group('value') or ""
                 elif ma.group('attr1'):     # attr=value
                     attrs[ma.group('attr1').strip()] = \
                         ma.group('value1') or ""
                 elif ma.group('attr2'):     # bare attr
                     attrs[ma.group('attr2').strip()] = ""
         return m.group('tag'), len(m.group(0)), attrs

     def _recursive_tagfind(self, ltag, rtag, start_index, block):
         """
         Find where the element opened before `start_index` is closed.

         Searches `block` from `start_index` for `rtag`, stepping over any
         nested `ltag` ... `rtag` pairs met on the way. Returns the index just
         past the matching `rtag`, or -1 if there is none.
         """
         while True:
             i = block.find(rtag, start_index)
             if i == -1:
                 return -1
             j = block.find(ltag, start_index)
             # if no ltag, or rtag found before another ltag, return index
             if (j > i or j == -1):
                 return i + len(rtag)
             # another ltag found before rtag, use end of ltag as starting
             # point and search again
             j = block.find('>', j)
             start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
             if start_index == -1:
                 # HTML potentially malformed- ltag has no corresponding
                 # rtag
                 return -1

     def _get_right_tag(self, left_tag, left_index, block):
         """
         Locate the closing tag for `left_tag` in `block`.

         Each form in `right_tag_patterns` is tried, searching from
         `left_index`. On success returns `(name, end)`, where `name` is the
         closing tag stripped of its angle brackets (e.g. "/div") and `end`
         is the index just past it. Otherwise returns the lowercased tail of
         the block (its last `left_index` characters minus the final one)
         together with `len(block)`.
         """
         for p in self.right_tag_patterns:
             tag = p % left_tag
             i = self._recursive_tagfind(
                 "<%s" % left_tag, tag, left_index, block
             )
             if i > 2:
                 return tag.lstrip("<").rstrip(">"), i
         return block.rstrip()[-left_index:-1].lower(), len(block)

     def _equal_tags(self, left_tag, right_tag):
         """
         Return True if `right_tag` is an acceptable close for `left_tag`.

         That is the case when `left_tag` starts with "?", "@" or "%"
         (PHP and similar), when `right_tag` is "/" + `left_tag`, or when
         both are "--" (comments).
         """
         if left_tag[0] in ['?', '@', '%']:  # handle PHP, etc.
             return True
         if ("/" + left_tag) == right_tag:
             return True
         return right_tag == "--" and left_tag == "--"

     def _is_oneliner(self, tag):
         """Return True for tags that never have a closing tag (hr)."""
         return (tag in ['hr', 'hr/'])

     def _stringindex_to_listindex(self, stringindex, items):
         """
         Same effect as concatenating the strings in items,
         finding the character to which stringindex refers in that string,
         and returning the index of the item in which that character resides.

         `items` itself is left untouched.
         """
         # Work on a copy: the trailing sentinel lets an index at the very
         # end of the text resolve, and must not leak into the caller's list.
         padded = list(items)
         padded.append('dummy')
         i, count = 0, 0
         while count <= stringindex:
             count += len(padded[i])
             i += 1
         return i - 1

     def _nested_markdown_in_html(self, items):
         """
         Find and process html child elements of the given element block.

         `items` is modified in place and also returned: children with a
         "markdown" attribute have their tags removed and recorded with
         `htmlStash.store_tag`; other children are replaced by the
         placeholder returned from `htmlStash.store`.
         """
         for i, item in enumerate(items):
             if self.left_tag_re.match(item):
                 remainder = ''.join(items[i:])
                 left_tag, left_index, attrs = self._get_left_tag(remainder)
                 right_tag, data_index = self._get_right_tag(
                     left_tag, left_index, remainder)
                 right_listindex = \
                     self._stringindex_to_listindex(data_index, items[i:]) + i
                 if 'markdown' in attrs:
                     items[i] = items[i][left_index:]  # remove opening tag
                     placeholder = self.markdown.htmlStash.store_tag(
                         left_tag, attrs, i + 1, right_listindex + 1)
                     items.insert(i, placeholder)
                     if len(items) - right_listindex <= 1:  # last nest, no tail
                         right_listindex -= 1
                     items[right_listindex] = items[right_listindex][
                         :-len(right_tag) - 2]  # remove closing tag
                 else:  # raw html
                     if len(items) - right_listindex <= 1:  # last element
                         right_listindex -= 1
                     offset = 1 if i == right_listindex else 0
                     placeholder = self.markdown.htmlStash.store('\n\n'.join(
                         items[i:right_listindex + offset]))
                     del items[i:right_listindex + offset]
                     items.insert(i, placeholder)
         return items

     def _store_collected_items(self, items, left_tag, left_index, right_tag,
                                attrs, new_blocks):
         """
         Stash the blocks gathered for one multi-block element.

         With `markdown_in_raw` set and a "markdown" attribute present, the
         opening and closing tags are cut from the first and last item, the
         tag is recorded with `htmlStash.store_tag` and the remaining content
         (after nested elements are processed) is appended to `new_blocks`.
         Otherwise the items are joined with blank lines and stored as one
         raw html block whose placeholder is appended to `new_blocks`.
         """
         stash = self.markdown.htmlStash
         if self.markdown_in_raw and 'markdown' in attrs:
             items[0] = items[0][left_index:]
             items[-1] = items[-1][:-len(right_tag) - 2]
             if items[len(items) - 1]:  # not a newline/empty string
                 right_index = len(items) + 3
             else:
                 right_index = len(items) + 2
             new_blocks.append(
                 stash.store_tag(left_tag, attrs, 0, right_index))
             placeholderslen = len(stash.tag_data)
             new_blocks.extend(self._nested_markdown_in_html(items))
             # Every nested store_tag call added an entry after ours, so
             # reach back past them to adjust this element's right_index.
             nests = len(stash.tag_data) - placeholderslen
             stash.tag_data[-1 - nests]['right_index'] += nests - 2
         else:
             new_blocks.append(stash.store('\n\n'.join(items)))

     def run(self, lines):
         """
         Replace html blocks in `lines` with stash placeholders.

         Takes the document as a list of lines and returns a new list of
         lines in which stored blocks have been swapped for the values
         returned by `self.markdown.htmlStash`.
         """
         new_blocks = []
         # Blocks still to be processed, kept in reverse order so the next
         # block is taken with pop() and leftover text is pushed back with
         # append(), both O(1).
         pending = "\n".join(lines).split("\n\n")
         pending.reverse()
         items = []
         left_tag = ''
         right_tag = ''
         left_index = 0
         attrs = {}
         in_tag = False  # flag

         while pending:
             block = pending.pop()
             # Up to two leading newlines are dropped from each block.
             if block.startswith("\n"):
                 block = block[1:]
             if block.startswith("\n"):
                 block = block[1:]

             if not in_tag:
                 if block.startswith("<") and len(block.strip()) > 1:

                     if block[1:4] == "!--":
                         # is a comment block
                         left_tag, left_index, attrs = "--", 2, {}
                     else:
                         left_tag, left_index, attrs = self._get_left_tag(block)
                     right_tag, data_index = self._get_right_tag(left_tag,
                                                                 left_index,
                                                                 block)
                     # keep checking conditions below and maybe just append

                     if data_index < len(block) and (
                             util.isBlockLevel(left_tag) or left_tag == '--'):
                         # text after the closing tag is processed as its
                         # own block on the next iteration
                         pending.append(block[data_index:])
                         block = block[:data_index]

                     if not (util.isBlockLevel(left_tag) or
                             block[1] in ["!", "?", "@", "%"]):
                         new_blocks.append(block)
                         continue

                     if self._is_oneliner(left_tag):
                         new_blocks.append(block.strip())
                         continue

                     if block.rstrip().endswith(">") \
                             and self._equal_tags(left_tag, right_tag):
                         if self.markdown_in_raw and 'markdown' in attrs:
                             block = block[left_index:-len(right_tag) - 2]
                             new_blocks.append(self.markdown.htmlStash.
                                               store_tag(left_tag, attrs, 0, 2))
                             new_blocks.append(block)
                         else:
                             new_blocks.append(
                                 self.markdown.htmlStash.store(block.strip()))
                         continue
                     else:
                         # if is block level tag and is not complete
                         if (not self._equal_tags(left_tag, right_tag)) and \
                            (util.isBlockLevel(left_tag) or left_tag == "--"):
                             items.append(block.strip())
                             in_tag = True
                         else:
                             new_blocks.append(
                                 self.markdown.htmlStash.store(block.strip())
                             )
                         continue

                 else:
                     new_blocks.append(block)

             else:
                 items.append(block)

                 right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                 if self._equal_tags(left_tag, right_tag):
                     # if find closing tag

                     if data_index < len(block):
                         # we have more text after right_tag
                         items[-1] = block[:data_index]
                         pending.append(block[data_index:])

                     in_tag = False
                     self._store_collected_items(
                         items, left_tag, left_index, right_tag, attrs,
                         new_blocks)
                     items = []

         if items:
             # The document ended inside an unclosed element.
             self._store_collected_items(
                 items, left_tag, left_index, right_tag, attrs, new_blocks)
             new_blocks.append('\n')

         new_text = "\n\n".join(new_blocks)
         return new_text.split("\n")


 class ReferencePreprocessor(Preprocessor):
     """ Remove reference definitions from text and store for later use. """

     # A title: "...", '...' or (...), optionally surrounded by spaces.
     TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
     # A definition line: up to three spaces, "[id]:", the link and an
     # optional title on the same line.
     RE = re.compile(
         r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
     )
     # A line that consists of a title only.
     TITLE_RE = re.compile(r'^%s$' % TITLE)

     def run(self, lines):
         """
         Collect reference definitions and return the remaining lines.

         Every line matching `RE` is dropped from the output and recorded in
         `self.markdown.references` as `id -> (link, title)`, with the id
         stripped and lowercased and surrounding "<" / ">" removed from the
         link. If the definition has no title and the following line is a
         title on its own, that line is consumed as the title; otherwise the
         title is None. The input list is not modified.
         """
         new_text = []
         total = len(lines)
         index = 0
         while index < total:
             line = lines[index]
             index += 1
             m = self.RE.match(line)
             if not m:
                 new_text.append(line)
                 continue
             ref_id = m.group(1).strip().lower()
             link = m.group(2).lstrip('<').rstrip('>')
             title = m.group(5) or m.group(6) or m.group(7)
             # Check next line for title, but only if there is a next line:
             # a definition may be the very last line of the input.
             if not title and index < total:
                 tm = self.TITLE_RE.match(lines[index])
                 if tm:
                     index += 1
                     title = tm.group(2) or tm.group(3) or tm.group(4)
             self.markdown.references[ref_id] = (link, title)

         return new_text  # + "\n"
	"""
	PRE-PROCESSORS
	=============================================================================

	Preprocessors work on source text before we start doing anything too
	complicated.
	"""

	from __future__ import absolute_import
	from __future__ import unicode_literals
	from . import util
	from . import odict
	import re


	def build_preprocessors(md_instance, **kwargs):
	    """
	    Assemble the default, ordered collection of preprocessors.

	    The whitespace normalizer is registered first and the reference
	    preprocessor last. The html block preprocessor sits between them unless
	    `md_instance.safeMode` equals 'escape', in which case it is left out.
	    Extra keyword arguments are accepted but not used.
	    """
	    preprocessors = odict.OrderedDict()
	    preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance)
	    if md_instance.safeMode != 'escape':
	        preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)
	    preprocessors["reference"] = ReferencePreprocessor(md_instance)
	    return preprocessors


	class Preprocessor(util.Processor):
	    """
	    Base class for preprocessors, which are run after the text is broken
	    into lines.

	    Each preprocessor implements a "run" method that takes a pointer to a
	    list of lines of the document, modifies it as necessary and returns
	    either the same pointer or a pointer to a new list.

	    Preprocessors must extend markdown.Preprocessor.

	    """
	    def run(self, lines):
	        """
	        Transform the document. This base implementation does nothing and
	        returns None.

	        Each subclass of Preprocessor should override the `run` method, which
	        takes the document as a list of strings split by newlines and returns
	        the (possibly modified) list of lines.

	        """
	        pass  # pragma: no cover


	class NormalizeWhitespace(Preprocessor):
	    """ Normalize whitespace for consistent parsing. """

	    def run(self, lines):
	        """
	        Return the document as a new list of lines with uniform whitespace.

	        The lines are joined with newlines and then, in order: occurrences
	        of `util.STX` and `util.ETX` are removed, "\\r\\n" and lone "\\r"
	        become "\\n", two newlines are appended, tabs are expanded to
	        `self.markdown.tab_length` columns and any run of spaces that sits
	        between two newlines is dropped.
	        """
	        source = '\n'.join(lines)
	        source = source.replace(util.STX, "").replace(util.ETX, "")
	        # Windows line endings must be handled before bare carriage returns.
	        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
	        source = source.expandtabs(self.markdown.tab_length)
	        # Lines made up only of spaces become truly empty lines.
	        source = re.sub(r'(?<=\n) +\n', '\n', source)
	        return source.split('\n')


	class HtmlBlockPreprocessor(Preprocessor):
	    """Remove html blocks from the text and store them for later retrieval.

	    The document is split on blank lines. A block that starts with ``<`` and
	    whose tag is block level (per ``util.isBlockLevel``), a comment, or a
	    ``<!``/``<?``/``<@``/``<%`` construct is handed to
	    ``self.markdown.htmlStash`` and replaced by the placeholder it returns.
	    When ``markdown_in_raw`` is true, an element carrying a ``markdown``
	    attribute keeps its content in the text and only its tag is recorded
	    with ``htmlStash.store_tag``.
	    """

	    # Closing forms tried, in order, for a tag name: "</name>" then "name>".
	    right_tag_patterns = ["</%s>", "%s>"]
	    # A single attribute, written for re.VERBOSE (whitespace and trailing
	    # "#" comments inside the pattern are ignored by the regex engine).
	    # The alternatives are separated by an unescaped "|".
	    attrs_pattern = r"""
	        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"
	        |                                                       # OR
	        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)             # attr=value
	        |                                                       # OR
	        \s+(?P<attr2>[^>"'/= ]+)                                # attr
	        """
	    # An opening tag at the start of a block: the name, any number of
	    # attributes, then an optional "/" and an optional ">".
	    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
	                       attrs_pattern
	    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
	    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
	    # When true, elements with a "markdown" attribute are recorded with
	    # htmlStash.store_tag and their content stays in the text.
	    markdown_in_raw = False

	    def _get_left_tag(self, block):
	        """
	        Parse the opening tag at the start of `block`.

	        Returns a tuple `(tag, length, attrs)`: the tag name, the number of
	        characters the opening tag occupies and a dict of its attributes
	        (attributes without a value map to an empty string). If the opening
	        tag pattern does not match, everything between the first character
	        and the first ">" is lowercased and used as the name, with no
	        attributes.
	        """
	        m = self.left_tag_re.match(block)
	        if not m:
	            tag = block[1:].split(">", 1)[0].lower()
	            return tag, len(tag) + 2, {}
	        attrs = {}
	        raw_attrs = m.group('attrs')
	        if raw_attrs:
	            for ma in self.attrs_re.finditer(raw_attrs):
	                if ma.group('attr'):        # attr="value"
	                    attrs[ma.group('attr').strip()] = \
	                        ma.group('value') or ""
	                elif ma.group('attr1'):     # attr=value
	                    attrs[ma.group('attr1').strip()] = \
	                        ma.group('value1') or ""
	                elif ma.group('attr2'):     # bare attr
	                    attrs[ma.group('attr2').strip()] = ""
	        return m.group('tag'), len(m.group(0)), attrs

	    def _recursive_tagfind(self, ltag, rtag, start_index, block):
	        """
	        Find where the element opened before `start_index` is closed.

	        Searches `block` from `start_index` for `rtag`, stepping over any
	        nested `ltag` ... `rtag` pairs met on the way. Returns the index just
	        past the matching `rtag`, or -1 if there is none.
	        """
	        while True:
	            i = block.find(rtag, start_index)
	            if i == -1:
	                return -1
	            j = block.find(ltag, start_index)
	            # if no ltag, or rtag found before another ltag, return index
	            if (j > i or j == -1):
	                return i + len(rtag)
	            # another ltag found before rtag, use end of ltag as starting
	            # point and search again
	            j = block.find('>', j)
	            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
	            if start_index == -1:
	                # HTML potentially malformed- ltag has no corresponding
	                # rtag
	                return -1

	    def _get_right_tag(self, left_tag, left_index, block):
	        """
	        Locate the closing tag for `left_tag` in `block`.

	        Each form in `right_tag_patterns` is tried, searching from
	        `left_index`. On success returns `(name, end)`, where `name` is the
	        closing tag stripped of its angle brackets (e.g. "/div") and `end`
	        is the index just past it. Otherwise returns the lowercased tail of
	        the block (its last `left_index` characters minus the final one)
	        together with `len(block)`.
	        """
	        for p in self.right_tag_patterns:
	            tag = p % left_tag
	            i = self._recursive_tagfind(
	                "<%s" % left_tag, tag, left_index, block
	            )
	            if i > 2:
	                return tag.lstrip("<").rstrip(">"), i
	        return block.rstrip()[-left_index:-1].lower(), len(block)

	    def _equal_tags(self, left_tag, right_tag):
	        """
	        Return True if `right_tag` is an acceptable close for `left_tag`.

	        That is the case when `left_tag` starts with "?", "@" or "%"
	        (PHP and similar), when `right_tag` is "/" + `left_tag`, or when
	        both are "--" (comments).
	        """
	        if left_tag[0] in ['?', '@', '%']:  # handle PHP, etc.
	            return True
	        if ("/" + left_tag) == right_tag:
	            return True
	        return right_tag == "--" and left_tag == "--"

	    def _is_oneliner(self, tag):
	        """Return True for tags that never have a closing tag (hr)."""
	        return (tag in ['hr', 'hr/'])

	    def _stringindex_to_listindex(self, stringindex, items):
	        """
	        Same effect as concatenating the strings in items,
	        finding the character to which stringindex refers in that string,
	        and returning the index of the item in which that character resides.

	        `items` itself is left untouched.
	        """
	        # Work on a copy: the trailing sentinel lets an index at the very
	        # end of the text resolve, and must not leak into the caller's list.
	        padded = list(items)
	        padded.append('dummy')
	        i, count = 0, 0
	        while count <= stringindex:
	            count += len(padded[i])
	            i += 1
	        return i - 1

	    def _nested_markdown_in_html(self, items):
	        """
	        Find and process html child elements of the given element block.

	        `items` is modified in place and also returned: children with a
	        "markdown" attribute have their tags removed and recorded with
	        `htmlStash.store_tag`; other children are replaced by the
	        placeholder returned from `htmlStash.store`.
	        """
	        for i, item in enumerate(items):
	            if self.left_tag_re.match(item):
	                remainder = ''.join(items[i:])
	                left_tag, left_index, attrs = self._get_left_tag(remainder)
	                right_tag, data_index = self._get_right_tag(
	                    left_tag, left_index, remainder)
	                right_listindex = \
	                    self._stringindex_to_listindex(data_index, items[i:]) + i
	                if 'markdown' in attrs:
	                    items[i] = items[i][left_index:]  # remove opening tag
	                    placeholder = self.markdown.htmlStash.store_tag(
	                        left_tag, attrs, i + 1, right_listindex + 1)
	                    items.insert(i, placeholder)
	                    if len(items) - right_listindex <= 1:  # last nest, no tail
	                        right_listindex -= 1
	                    items[right_listindex] = items[right_listindex][
	                        :-len(right_tag) - 2]  # remove closing tag
	                else:  # raw html
	                    if len(items) - right_listindex <= 1:  # last element
	                        right_listindex -= 1
	                    offset = 1 if i == right_listindex else 0
	                    placeholder = self.markdown.htmlStash.store('\n\n'.join(
	                        items[i:right_listindex + offset]))
	                    del items[i:right_listindex + offset]
	                    items.insert(i, placeholder)
	        return items

	    def _store_collected_items(self, items, left_tag, left_index, right_tag,
	                               attrs, new_blocks):
	        """
	        Stash the blocks gathered for one multi-block element.

	        With `markdown_in_raw` set and a "markdown" attribute present, the
	        opening and closing tags are cut from the first and last item, the
	        tag is recorded with `htmlStash.store_tag` and the remaining content
	        (after nested elements are processed) is appended to `new_blocks`.
	        Otherwise the items are joined with blank lines and stored as one
	        raw html block whose placeholder is appended to `new_blocks`.
	        """
	        stash = self.markdown.htmlStash
	        if self.markdown_in_raw and 'markdown' in attrs:
	            items[0] = items[0][left_index:]
	            items[-1] = items[-1][:-len(right_tag) - 2]
	            if items[len(items) - 1]:  # not a newline/empty string
	                right_index = len(items) + 3
	            else:
	                right_index = len(items) + 2
	            new_blocks.append(
	                stash.store_tag(left_tag, attrs, 0, right_index))
	            placeholderslen = len(stash.tag_data)
	            new_blocks.extend(self._nested_markdown_in_html(items))
	            # Every nested store_tag call added an entry after ours, so
	            # reach back past them to adjust this element's right_index.
	            nests = len(stash.tag_data) - placeholderslen
	            stash.tag_data[-1 - nests]['right_index'] += nests - 2
	        else:
	            new_blocks.append(stash.store('\n\n'.join(items)))

	    def run(self, lines):
	        """
	        Replace html blocks in `lines` with stash placeholders.

	        Takes the document as a list of lines and returns a new list of
	        lines in which stored blocks have been swapped for the values
	        returned by `self.markdown.htmlStash`.
	        """
	        new_blocks = []
	        # Blocks still to be processed, kept in reverse order so the next
	        # block is taken with pop() and leftover text is pushed back with
	        # append(), both O(1).
	        pending = "\n".join(lines).split("\n\n")
	        pending.reverse()
	        items = []
	        left_tag = ''
	        right_tag = ''
	        left_index = 0
	        attrs = {}
	        in_tag = False  # flag

	        while pending:
	            block = pending.pop()
	            # Up to two leading newlines are dropped from each block.
	            if block.startswith("\n"):
	                block = block[1:]
	            if block.startswith("\n"):
	                block = block[1:]

	            if not in_tag:
	                if block.startswith("<") and len(block.strip()) > 1:

	                    if block[1:4] == "!--":
	                        # is a comment block
	                        left_tag, left_index, attrs = "--", 2, {}
	                    else:
	                        left_tag, left_index, attrs = self._get_left_tag(block)
	                    right_tag, data_index = self._get_right_tag(left_tag,
	                                                                left_index,
	                                                                block)
	                    # keep checking conditions below and maybe just append

	                    if data_index < len(block) and (
	                            util.isBlockLevel(left_tag) or left_tag == '--'):
	                        # text after the closing tag is processed as its
	                        # own block on the next iteration
	                        pending.append(block[data_index:])
	                        block = block[:data_index]

	                    if not (util.isBlockLevel(left_tag) or
	                            block[1] in ["!", "?", "@", "%"]):
	                        new_blocks.append(block)
	                        continue

	                    if self._is_oneliner(left_tag):
	                        new_blocks.append(block.strip())
	                        continue

	                    if block.rstrip().endswith(">") \
	                            and self._equal_tags(left_tag, right_tag):
	                        if self.markdown_in_raw and 'markdown' in attrs:
	                            block = block[left_index:-len(right_tag) - 2]
	                            new_blocks.append(self.markdown.htmlStash.
	                                              store_tag(left_tag, attrs, 0, 2))
	                            new_blocks.append(block)
	                        else:
	                            new_blocks.append(
	                                self.markdown.htmlStash.store(block.strip()))
	                        continue
	                    else:
	                        # if is block level tag and is not complete
	                        if (not self._equal_tags(left_tag, right_tag)) and \
	                           (util.isBlockLevel(left_tag) or left_tag == "--"):
	                            items.append(block.strip())
	                            in_tag = True
	                        else:
	                            new_blocks.append(
	                                self.markdown.htmlStash.store(block.strip())
	                            )
	                        continue

	                else:
	                    new_blocks.append(block)

	            else:
	                items.append(block)

	                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

	                if self._equal_tags(left_tag, right_tag):
	                    # if find closing tag

	                    if data_index < len(block):
	                        # we have more text after right_tag
	                        items[-1] = block[:data_index]
	                        pending.append(block[data_index:])

	                    in_tag = False
	                    self._store_collected_items(
	                        items, left_tag, left_index, right_tag, attrs,
	                        new_blocks)
	                    items = []

	        if items:
	            # The document ended inside an unclosed element.
	            self._store_collected_items(
	                items, left_tag, left_index, right_tag, attrs, new_blocks)
	            new_blocks.append('\n')

	        new_text = "\n\n".join(new_blocks)
	        return new_text.split("\n")


	class ReferencePreprocessor(Preprocessor):
	    """ Remove reference definitions from text and store for later use. """

	    # A title: "...", '...' or (...), optionally surrounded by spaces.
	    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
	    # A definition line: up to three spaces, "[id]:", the link and an
	    # optional title on the same line.
	    RE = re.compile(
	        r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
	    )
	    # A line that consists of a title only.
	    TITLE_RE = re.compile(r'^%s$' % TITLE)

	    def run(self, lines):
	        """
	        Collect reference definitions and return the remaining lines.

	        Every line matching `RE` is dropped from the output and recorded in
	        `self.markdown.references` as `id -> (link, title)`, with the id
	        stripped and lowercased and surrounding "<" / ">" removed from the
	        link. If the definition has no title and the following line is a
	        title on its own, that line is consumed as the title; otherwise the
	        title is None. The input list is not modified.
	        """
	        new_text = []
	        total = len(lines)
	        index = 0
	        while index < total:
	            line = lines[index]
	            index += 1
	            m = self.RE.match(line)
	            if not m:
	                new_text.append(line)
	                continue
	            ref_id = m.group(1).strip().lower()
	            link = m.group(2).lstrip('<').rstrip('>')
	            title = m.group(5) or m.group(6) or m.group(7)
	            # Check next line for title, but only if there is a next line:
	            # a definition may be the very last line of the input.
	            if not title and index < total:
	                tm = self.TITLE_RE.match(lines[index])
	                if tm:
	                    index += 1
	                    title = tm.group(2) or tm.group(3) or tm.group(4)
	            self.markdown.references[ref_id] = (link, title)

	        return new_text  # + "\n"