third_party/markdown/preprocessors.py - chromium/src - Git at Google

 # markdown is released under the BSD license
 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
 # Copyright 2004 Manfred Stienstra (the original version)
 #
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #
 # *   Redistributions of source code must retain the above copyright
 #     notice, this list of conditions and the following disclaimer.
 # *   Redistributions in binary form must reproduce the above copyright
 #     notice, this list of conditions and the following disclaimer in the
 #     documentation and/or other materials provided with the distribution.
 # *   Neither the name of the <organization> nor the
 #     names of its contributors may be used to endorse or promote products
 #     derived from this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.


 """
 PRE-PROCESSORS
 =============================================================================

 Preprocessors work on source text before we start doing anything too
 complicated.
 """

 from __future__ import absolute_import
 from __future__ import unicode_literals
 from . import util
 from . import odict
 import re


 def build_preprocessors(md_instance, **kwargs):
     """
     Assemble the stock preprocessors for a Markdown instance.

     The result is an ordered mapping from name to preprocessor, filled in
     this order: 'normalize_whitespace', then 'html_block' (left out when
     `md_instance.safeMode` equals 'escape'), then 'reference'.

     Extra keyword arguments are accepted but not used here.
     """
     registry = odict.OrderedDict()
     registry['normalize_whitespace'] = NormalizeWhitespace(md_instance)
     html_blocks_wanted = md_instance.safeMode != 'escape'
     if html_blocks_wanted:
         registry["html_block"] = HtmlBlockPreprocessor(md_instance)
     registry["reference"] = ReferencePreprocessor(md_instance)
     return registry


 class Preprocessor(util.Processor):
     """
     Base class for processors that work on the document's list of lines.

     Preprocessors are run after the text is broken into lines.  A concrete
     preprocessor supplies a "run" method that receives the list of lines,
     changes it as needed, and hands back either that same list or a new
     one.

     Preprocessors must extend markdown.Preprocessor.

     """
     def run(self, lines):
         """
         Hook to be overridden by every subclass.

         `lines` is the document as a list of strings split by newlines; an
         override returns the (possibly modified) list of lines.  This base
         implementation does nothing and returns None.

         """
         return None


 class NormalizeWhitespace(Preprocessor):
     """ Normalize whitespace for consistent parsing. """

     def run(self, lines):
         """
         Return the document's lines with whitespace normalized.

         Steps, in order: join the lines with newlines, drop every
         `util.STX` / `util.ETX` occurrence, convert "\\r\\n" and "\\r" to
         "\\n", append two newlines, expand tabs to
         `self.markdown.tab_length`, and empty out lines that consist only
         of spaces.  The result is split on newlines again.
         """
         joined = '\n'.join(lines)
         for marker in (util.STX, util.ETX):
             joined = joined.replace(marker, "")
         unix_text = joined.replace("\r\n", "\n").replace("\r", "\n")
         padded = unix_text + "\n\n"
         detabbed = padded.expandtabs(self.markdown.tab_length)
         # A run of spaces that fills a whole line (preceded by a newline)
         # collapses to just the newline.
         cleaned = re.sub(r'(?<=\n) +\n', '\n', detabbed)
         return cleaned.split('\n')


 class HtmlBlockPreprocessor(Preprocessor):
     """Remove html blocks from the text and store them for later retrieval.

     `run` splits the document on blank lines and inspects every block that
     starts with "<".  Blocks recognised as raw HTML (possibly spanning
     several blank-line-separated blocks) are passed to
     `self.markdown.htmlStash.store` and the value it returns is put in the
     output instead (presumably a placeholder string -- confirm against the
     stash implementation).
     """

     # Templates tried in order by `_get_right_tag` to build the closing-tag
     # text from the left tag name ("</tag>" first, then "tag>").
     right_tag_patterns = ["</%s>", "%s>"]
     # Verbose-mode regex matching one attribute in one of three forms:
     # quoted value, unquoted value, or bare attribute name.
     attrs_pattern = r"""
         \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
         |                                                         # OR
         \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
         |                                                         # OR
         \s+(?P<attr2>[^>"'/= ]+)                                  # attr
         """
     # Opening tag at the start of a block: "<", the tag name, any number of
     # attributes, then an optional "/" and an optional ">".
     left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
     attrs_re = re.compile(attrs_pattern, re.VERBOSE)
     left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
     # When true, a block whose opening tag carries a 'markdown' attribute
     # has only its opening and closing tags stashed; the text between them
     # stays in the document.
     markdown_in_raw = False

     def _get_left_tag(self, block):
         """
         Parse the opening tag at the start of `block`.

         Returns a 3-tuple `(tag, length, attrs)`: the tag name, the length
         of the text consumed by the opening tag, and a dict of attribute
         names to values ("" for attributes without a value).

         If `left_tag_re` does not match, the tag is taken as the text
         between the first character and the first ">" (lower-cased), the
         length is `len(tag) + 2` and the attribute dict is empty.
         """
         m = self.left_tag_re.match(block)
         if m:
             tag = m.group('tag')
             raw_attrs = m.group('attrs')
             attrs = {}
             if raw_attrs:
                 # Re-scan the attribute text one attribute at a time; which
                 # named group matched tells us which of the three forms
                 # this attribute uses.
                 for ma in self.attrs_re.finditer(raw_attrs):
                     if ma.group('attr'):
                         if ma.group('value'):
                             attrs[ma.group('attr').strip()] = ma.group('value')
                         else:
                             attrs[ma.group('attr').strip()] = ""
                     elif ma.group('attr1'):
                         if ma.group('value1'):
                             attrs[ma.group('attr1').strip()] = ma.group('value1')
                         else:
                             attrs[ma.group('attr1').strip()] = ""
                     elif ma.group('attr2'):
                         attrs[ma.group('attr2').strip()] = ""
             return tag, len(m.group(0)), attrs
         else:
             tag = block[1:].split(">", 1)[0].lower()
             # +2 accounts for the "<" before the tag text and one character
             # after it.
             return tag, len(tag)+2, {}

     def _recursive_tagfind(self, ltag, rtag, start_index, block):
         """
         Find the `rtag` that balances an already-open `ltag`.

         Searches `block` from `start_index`.  Every further `ltag` found
         before the next `rtag` is treated as a nested opening and is
         resolved by a recursive call before the search continues.

         Returns the index just past the matching `rtag`, or -1 if there is
         no matching `rtag`.
         """
         while 1:
             i = block.find(rtag, start_index)
             if i == -1:
                 return -1
             j = block.find(ltag, start_index)
             # if no ltag, or rtag found before another ltag, return index
             if (j > i or j == -1):
                 return i + len(rtag)
             # another ltag found before rtag, use end of ltag as starting
             # point and search again
             j = block.find('>', j)
             start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
             if start_index == -1:
                 # HTML potentially malformed- ltag has no corresponding
                 # rtag
                 return -1

     def _get_right_tag(self, left_tag, left_index, block):
         """
         Locate the closing tag for `left_tag` in `block`.

         Each template in `right_tag_patterns` is tried in turn, searching
         from `left_index`.  The first search result greater than 2 wins
         and `(tag text without "<" and ">", index just past it)` is
         returned.

         If nothing is found, the candidate right tag is cut from the end
         of the right-stripped block (`[-left_index:-1]`, lower-cased) and
         the index is `len(block)`.
         """
         for p in self.right_tag_patterns:
             tag = p % left_tag
             i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
             # -1 signals "not found"; only results past index 2 count.
             if i > 2:
                 return tag.lstrip("<").rstrip(">"), i
         # Note: with left_index == 0 the slice is [0:-1], i.e. the whole
         # stripped block minus its last character.
         return block.rstrip()[-left_index:-1].lower(), len(block)

     def _equal_tags(self, left_tag, right_tag):
         """
         Return True if `right_tag` is considered to close `left_tag`.

         That is the case when `left_tag` starts with "?", "@" or "%", when
         `right_tag` is "/" followed by `left_tag`, or when both are "--"
         (the marker `run` uses for comment blocks).
         """
         if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
             return True
         if ("/" + left_tag) == right_tag:
             return True
         if (right_tag == "--" and left_tag == "--"):
             return True
         elif left_tag == right_tag[1:] \
             and right_tag[0] == "/":
             return True
         else:
             return False

     def _is_oneliner(self, tag):
         """Return True for the tags ('hr', 'hr/') that have no closing tag."""
         return (tag in ['hr', 'hr/'])

     def run(self, lines):
         """
         Replace raw HTML blocks in `lines` with stashed references.

         The lines are joined and split on blank lines into blocks.  Blocks
         are consumed front to back; `in_tag` records whether an HTML block
         opened earlier is still waiting for its closing tag, in which case
         the blocks seen so far are collected in `items`.

         Returns the processed document as a list of lines.
         """
         text = "\n".join(lines)
         new_blocks = []
         text = text.rsplit("\n\n")
         items = []
         left_tag = ''
         right_tag = ''
         in_tag = False # True while inside an unclosed HTML block

         while text:
             # Take the next block and drop up to two leading newlines.
             block = text[0]
             if block.startswith("\n"):
                 block = block[1:]
             text = text[1:]

             if block.startswith("\n"):
                 block = block[1:]

             if not in_tag:
                 if block.startswith("<") and len(block.strip()) > 1:

                     if block[1] == "!":
                         # is a comment block
                         left_tag, left_index, attrs  = "--", 2, {}
                     else:
                         left_tag, left_index, attrs = self._get_left_tag(block)
                     right_tag, data_index = self._get_right_tag(left_tag,
                                                                 left_index,
                                                                 block)
                     # keep checking conditions below and maybe just append

                     # Text after the closing tag goes back to the front of
                     # the queue so it is processed as its own block.
                     if data_index < len(block) \
                         and (util.isBlockLevel(left_tag)
                         or left_tag == '--'):
                         text.insert(0, block[data_index:])
                         block = block[:data_index]

                     # Neither a block-level tag (per util.isBlockLevel) nor
                     # a "<!", "<?", "<@" or "<%" construct: leave the block
                     # in the document untouched.
                     if not (util.isBlockLevel(left_tag) \
                         or block[1] in ["!", "?", "@", "%"]):
                         new_blocks.append(block)
                         continue

                     if self._is_oneliner(left_tag):
                         new_blocks.append(block.strip())
                         continue

                     # The block ends with ">" and its tags pair up: it is a
                     # complete HTML block.
                     if block.rstrip().endswith(">") \
                         and self._equal_tags(left_tag, right_tag):
                         if self.markdown_in_raw and 'markdown' in attrs.keys():
                             # Stash the opening tag (minus its markdown
                             # attribute) and the closing tag separately and
                             # keep the inner text in the document.
                             start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                                            '', block[:left_index])
                             end = block[-len(right_tag)-2:]
                             block = block[left_index:-len(right_tag)-2]
                             new_blocks.append(
                                 self.markdown.htmlStash.store(start))
                             new_blocks.append(block)
                             new_blocks.append(
                                 self.markdown.htmlStash.store(end))
                         else:
                             new_blocks.append(
                                 self.markdown.htmlStash.store(block.strip()))
                         continue
                     else:
                         # if is block level tag and is not complete

                         # NOTE(review): `and` binds tighter than `or`, so
                         # this reads as
                         #   isBlockLevel(left_tag)
                         #   or (left_tag == "--" and not ...endswith(">"))
                         # The comment above suggests a different grouping
                         # may have been intended -- confirm before changing.
                         if util.isBlockLevel(left_tag) or left_tag == "--" \
                             and not block.rstrip().endswith(">"):
                             items.append(block.strip())
                             in_tag = True
                         else:
                             new_blocks.append(
                             self.markdown.htmlStash.store(block.strip()))

                         continue

                 # Not an HTML candidate: keep the block as it is.
                 new_blocks.append(block)

             else:
                 # Inside an open HTML block: collect this block and look
                 # for the closing tag from the start of the block.
                 items.append(block)

                 right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                 if self._equal_tags(left_tag, right_tag):
                     # if find closing tag

                     if data_index < len(block):
                         # we have more text after right_tag
                         items[-1] = block[:data_index]
                         text.insert(0, block[data_index:])

                     in_tag = False
                     if self.markdown_in_raw and 'markdown' in attrs.keys():
                         start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                                        '', items[0][:left_index])
                         items[0] = items[0][left_index:]
                         end = items[-1][-len(right_tag)-2:]
                         items[-1] = items[-1][:-len(right_tag)-2]
                         new_blocks.append(
                             self.markdown.htmlStash.store(start))
                         new_blocks.extend(items)
                         new_blocks.append(
                             self.markdown.htmlStash.store(end))
                     else:
                         new_blocks.append(
                             self.markdown.htmlStash.store('\n\n'.join(items)))
                     items = []

         # The input ended while an HTML block was still open: flush what was
         # collected.
         if items:
             if self.markdown_in_raw and 'markdown' in attrs.keys():
                 start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                                '', items[0][:left_index])
                 items[0] = items[0][left_index:]
                 end = items[-1][-len(right_tag)-2:]
                 items[-1] = items[-1][:-len(right_tag)-2]
                 new_blocks.append(
                     self.markdown.htmlStash.store(start))
                 new_blocks.extend(items)
                 if end.strip():
                     new_blocks.append(
                         self.markdown.htmlStash.store(end))
             else:
                 new_blocks.append(
                     self.markdown.htmlStash.store('\n\n'.join(items)))
             new_blocks.append('\n')

         new_text = "\n\n".join(new_blocks)
         return new_text.split("\n")


 class ReferencePreprocessor(Preprocessor):
     """
     Remove reference definitions from text and store for later use.

     A definition line has the form `[id]: link` with an optional title in
     double quotes, single quotes or parentheses.  The title may also stand
     alone on the line that follows the definition.
     """

     TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
     RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
     TITLE_RE = re.compile(r'^%s$' % TITLE)

     def run(self, lines):
         """
         Strip reference definitions out of `lines` and record them.

         Each definition is saved in `self.markdown.references` under its
         stripped, lower-cased id as a `(link, title)` tuple.  The link has
         leading "<" and trailing ">" characters removed; the title is None
         when no title was found.

         Returns a new list holding the lines that are not part of a
         definition.  As before, `lines` is left empty on return.
         """
         new_text = []
         total = len(lines)
         pos = 0
         # Walk by index rather than popping from the front on every step,
         # which costs O(n) per pop.
         while pos < total:
             line = lines[pos]
             pos += 1
             m = self.RE.match(line)
             if not m:
                 new_text.append(line)
                 continue
             ref_id = m.group(1).strip().lower()
             link = m.group(2).lstrip('<').rstrip('>')
             # Groups 5-7 are the title text for the "..", '..' and (..)
             # forms respectively.
             title = m.group(5) or m.group(6) or m.group(7)
             # Check next line for title -- but only if there is a next
             # line; a definition on the final line used to raise IndexError.
             if not title and pos < total:
                 tm = self.TITLE_RE.match(lines[pos])
                 if tm:
                     pos += 1
                     title = tm.group(2) or tm.group(3) or tm.group(4)
             self.markdown.references[ref_id] = (link, title)

         # The previous implementation consumed `lines` while iterating;
         # keep that observable effect for callers that may rely on it.
         del lines[:]
         return new_text
	# markdown is released under the BSD license
	# Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
	# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
	# Copyright 2004 Manfred Stienstra (the original version)
	#
	# All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions are met:
	#
	# * Redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer.
	# * Redistributions in binary form must reproduce the above copyright
	# notice, this list of conditions and the following disclaimer in the
	# documentation and/or other materials provided with the distribution.
	# * Neither the name of the <organization> nor the
	# names of its contributors may be used to endorse or promote products
	# derived from this software without specific prior written permission.
	#
	# THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
	# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	# DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
	# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	# POSSIBILITY OF SUCH DAMAGE.


	"""
	PRE-PROCESSORS
	=============================================================================

	Preprocessors work on source text before we start doing anything too
	complicated.
	"""

	from __future__ import absolute_import
	from __future__ import unicode_literals
	from . import util
	from . import odict
	import re


	def build_preprocessors(md_instance, **kwargs):
	    """
	    Build the default set of preprocessors used by Markdown.

	    Returns an ordered mapping from name to preprocessor, filled in this
	    order: 'normalize_whitespace', then 'html_block' (left out when
	    `md_instance.safeMode` equals 'escape'), then 'reference'.

	    Extra keyword arguments are accepted but not used here.
	    """
	    preprocessors = odict.OrderedDict()
	    preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance)
	    if md_instance.safeMode != 'escape':
	        preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)
	    preprocessors["reference"] = ReferencePreprocessor(md_instance)
	    return preprocessors


	class Preprocessor(util.Processor):
	    """
	    Preprocessors are run after the text is broken into lines.

	    Each preprocessor implements a "run" method that takes a pointer to a
	    list of lines of the document, modifies it as necessary and returns
	    either the same pointer or a pointer to a new list.

	    Preprocessors must extend markdown.Preprocessor.

	    """
	    def run(self, lines):
	        """
	        Each subclass of Preprocessor should override the `run` method, which
	        takes the document as a list of strings split by newlines and returns
	        the (possibly modified) list of lines.

	        This base implementation does nothing and returns None.

	        """
	        pass


	class NormalizeWhitespace(Preprocessor):
	    """ Normalize whitespace for consistent parsing. """

	    def run(self, lines):
	        """
	        Return the document's lines with whitespace normalized.

	        Steps, in order: join the lines with newlines, drop every
	        `util.STX` / `util.ETX` occurrence, convert "\\r\\n" and "\\r" to
	        "\\n", append two newlines, expand tabs to
	        `self.markdown.tab_length`, and empty out lines that consist only
	        of spaces.  The result is split on newlines again.
	        """
	        source = '\n'.join(lines)
	        source = source.replace(util.STX, "").replace(util.ETX, "")
	        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
	        source = source.expandtabs(self.markdown.tab_length)
	        # A run of spaces that fills a whole line (preceded by a newline)
	        # collapses to just the newline.
	        source = re.sub(r'(?<=\n) +\n', '\n', source)
	        return source.split('\n')


	class HtmlBlockPreprocessor(Preprocessor):
	    """Remove html blocks from the text and store them for later retrieval.

	    `run` splits the document on blank lines and inspects every block that
	    starts with "<".  Blocks recognised as raw HTML (possibly spanning
	    several blank-line-separated blocks) are passed to
	    `self.markdown.htmlStash.store` and the value it returns is put in the
	    output instead (presumably a placeholder string -- confirm against the
	    stash implementation).
	    """

	    # Templates tried in order by `_get_right_tag` to build the closing-tag
	    # text from the left tag name ("</tag>" first, then "tag>").
	    right_tag_patterns = ["</%s>", "%s>"]
	    # Verbose-mode regex matching one attribute in one of three forms:
	    # quoted value, unquoted value, or bare attribute name.
	    attrs_pattern = r"""
	        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
	        |                                                         # OR
	        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
	        |                                                         # OR
	        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
	        """
	    # Opening tag at the start of a block: "<", the tag name, any number of
	    # attributes, then an optional "/" and an optional ">".
	    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
	    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
	    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
	    # When true, a block whose opening tag carries a 'markdown' attribute
	    # has only its opening and closing tags stashed; the text between them
	    # stays in the document.
	    markdown_in_raw = False

	    def _get_left_tag(self, block):
	        """
	        Parse the opening tag at the start of `block`.

	        Returns a 3-tuple `(tag, length, attrs)`: the tag name, the length
	        of the text consumed by the opening tag, and a dict of attribute
	        names to values ("" for attributes without a value).

	        If `left_tag_re` does not match, the tag is taken as the text
	        between the first character and the first ">" (lower-cased), the
	        length is `len(tag) + 2` and the attribute dict is empty.
	        """
	        m = self.left_tag_re.match(block)
	        if m:
	            tag = m.group('tag')
	            raw_attrs = m.group('attrs')
	            attrs = {}
	            if raw_attrs:
	                # Re-scan the attribute text one attribute at a time; which
	                # named group matched tells us which of the three forms
	                # this attribute uses.
	                for ma in self.attrs_re.finditer(raw_attrs):
	                    if ma.group('attr'):
	                        if ma.group('value'):
	                            attrs[ma.group('attr').strip()] = ma.group('value')
	                        else:
	                            attrs[ma.group('attr').strip()] = ""
	                    elif ma.group('attr1'):
	                        if ma.group('value1'):
	                            attrs[ma.group('attr1').strip()] = ma.group('value1')
	                        else:
	                            attrs[ma.group('attr1').strip()] = ""
	                    elif ma.group('attr2'):
	                        attrs[ma.group('attr2').strip()] = ""
	            return tag, len(m.group(0)), attrs
	        else:
	            tag = block[1:].split(">", 1)[0].lower()
	            # +2 accounts for the "<" before the tag text and one character
	            # after it.
	            return tag, len(tag)+2, {}

	    def _recursive_tagfind(self, ltag, rtag, start_index, block):
	        """
	        Find the `rtag` that balances an already-open `ltag`.

	        Searches `block` from `start_index`.  Every further `ltag` found
	        before the next `rtag` is treated as a nested opening and is
	        resolved by a recursive call before the search continues.

	        Returns the index just past the matching `rtag`, or -1 if there is
	        no matching `rtag`.
	        """
	        while 1:
	            i = block.find(rtag, start_index)
	            if i == -1:
	                return -1
	            j = block.find(ltag, start_index)
	            # if no ltag, or rtag found before another ltag, return index
	            if (j > i or j == -1):
	                return i + len(rtag)
	            # another ltag found before rtag, use end of ltag as starting
	            # point and search again
	            j = block.find('>', j)
	            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
	            if start_index == -1:
	                # HTML potentially malformed- ltag has no corresponding
	                # rtag
	                return -1

	    def _get_right_tag(self, left_tag, left_index, block):
	        """
	        Locate the closing tag for `left_tag` in `block`.

	        Each template in `right_tag_patterns` is tried in turn, searching
	        from `left_index`.  The first search result greater than 2 wins
	        and `(tag text without "<" and ">", index just past it)` is
	        returned.

	        If nothing is found, the candidate right tag is cut from the end
	        of the right-stripped block (`[-left_index:-1]`, lower-cased) and
	        the index is `len(block)`.
	        """
	        for p in self.right_tag_patterns:
	            tag = p % left_tag
	            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
	            # -1 signals "not found"; only results past index 2 count.
	            if i > 2:
	                return tag.lstrip("<").rstrip(">"), i
	        # Note: with left_index == 0 the slice is [0:-1], i.e. the whole
	        # stripped block minus its last character.
	        return block.rstrip()[-left_index:-1].lower(), len(block)

	    def _equal_tags(self, left_tag, right_tag):
	        """
	        Return True if `right_tag` is considered to close `left_tag`.

	        That is the case when `left_tag` starts with "?", "@" or "%", when
	        `right_tag` is "/" followed by `left_tag`, or when both are "--"
	        (the marker `run` uses for comment blocks).
	        """
	        if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
	            return True
	        if ("/" + left_tag) == right_tag:
	            return True
	        if (right_tag == "--" and left_tag == "--"):
	            return True
	        elif left_tag == right_tag[1:] \
	            and right_tag[0] == "/":
	            return True
	        else:
	            return False

	    def _is_oneliner(self, tag):
	        """Return True for the tags ('hr', 'hr/') that have no closing tag."""
	        return (tag in ['hr', 'hr/'])

	    def run(self, lines):
	        """
	        Replace raw HTML blocks in `lines` with stashed references.

	        The lines are joined and split on blank lines into blocks.  Blocks
	        are consumed front to back; `in_tag` records whether an HTML block
	        opened earlier is still waiting for its closing tag, in which case
	        the blocks seen so far are collected in `items`.

	        Returns the processed document as a list of lines.
	        """
	        text = "\n".join(lines)
	        new_blocks = []
	        text = text.rsplit("\n\n")
	        items = []
	        left_tag = ''
	        right_tag = ''
	        in_tag = False # True while inside an unclosed HTML block

	        while text:
	            # Take the next block and drop up to two leading newlines.
	            block = text[0]
	            if block.startswith("\n"):
	                block = block[1:]
	            text = text[1:]

	            if block.startswith("\n"):
	                block = block[1:]

	            if not in_tag:
	                if block.startswith("<") and len(block.strip()) > 1:

	                    if block[1] == "!":
	                        # is a comment block
	                        left_tag, left_index, attrs = "--", 2, {}
	                    else:
	                        left_tag, left_index, attrs = self._get_left_tag(block)
	                    right_tag, data_index = self._get_right_tag(left_tag,
	                                                                left_index,
	                                                                block)
	                    # keep checking conditions below and maybe just append

	                    # Text after the closing tag goes back to the front of
	                    # the queue so it is processed as its own block.
	                    if data_index < len(block) \
	                        and (util.isBlockLevel(left_tag)
	                        or left_tag == '--'):
	                        text.insert(0, block[data_index:])
	                        block = block[:data_index]

	                    # Neither a block-level tag (per util.isBlockLevel) nor
	                    # a "<!", "<?", "<@" or "<%" construct: leave the block
	                    # in the document untouched.
	                    if not (util.isBlockLevel(left_tag) \
	                        or block[1] in ["!", "?", "@", "%"]):
	                        new_blocks.append(block)
	                        continue

	                    if self._is_oneliner(left_tag):
	                        new_blocks.append(block.strip())
	                        continue

	                    # The block ends with ">" and its tags pair up: it is a
	                    # complete HTML block.
	                    if block.rstrip().endswith(">") \
	                        and self._equal_tags(left_tag, right_tag):
	                        if self.markdown_in_raw and 'markdown' in attrs.keys():
	                            # Stash the opening tag (minus its markdown
	                            # attribute) and the closing tag separately and
	                            # keep the inner text in the document.
	                            start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
	                                           '', block[:left_index])
	                            end = block[-len(right_tag)-2:]
	                            block = block[left_index:-len(right_tag)-2]
	                            new_blocks.append(
	                                self.markdown.htmlStash.store(start))
	                            new_blocks.append(block)
	                            new_blocks.append(
	                                self.markdown.htmlStash.store(end))
	                        else:
	                            new_blocks.append(
	                                self.markdown.htmlStash.store(block.strip()))
	                        continue
	                    else:
	                        # if is block level tag and is not complete

	                        # NOTE(review): `and` binds tighter than `or`, so
	                        # this reads as
	                        #   isBlockLevel(left_tag)
	                        #   or (left_tag == "--" and not ...endswith(">"))
	                        # The comment above suggests a different grouping
	                        # may have been intended -- confirm before changing.
	                        if util.isBlockLevel(left_tag) or left_tag == "--" \
	                            and not block.rstrip().endswith(">"):
	                            items.append(block.strip())
	                            in_tag = True
	                        else:
	                            new_blocks.append(
	                                self.markdown.htmlStash.store(block.strip()))

	                        continue

	                # Not an HTML candidate: keep the block as it is.
	                new_blocks.append(block)

	            else:
	                # Inside an open HTML block: collect this block and look
	                # for the closing tag from the start of the block.
	                items.append(block)

	                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

	                if self._equal_tags(left_tag, right_tag):
	                    # if find closing tag

	                    if data_index < len(block):
	                        # we have more text after right_tag
	                        items[-1] = block[:data_index]
	                        text.insert(0, block[data_index:])

	                    in_tag = False
	                    if self.markdown_in_raw and 'markdown' in attrs.keys():
	                        start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
	                                       '', items[0][:left_index])
	                        items[0] = items[0][left_index:]
	                        end = items[-1][-len(right_tag)-2:]
	                        items[-1] = items[-1][:-len(right_tag)-2]
	                        new_blocks.append(
	                            self.markdown.htmlStash.store(start))
	                        new_blocks.extend(items)
	                        new_blocks.append(
	                            self.markdown.htmlStash.store(end))
	                    else:
	                        new_blocks.append(
	                            self.markdown.htmlStash.store('\n\n'.join(items)))
	                    items = []

	        # The input ended while an HTML block was still open: flush what was
	        # collected.
	        if items:
	            if self.markdown_in_raw and 'markdown' in attrs.keys():
	                start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
	                               '', items[0][:left_index])
	                items[0] = items[0][left_index:]
	                end = items[-1][-len(right_tag)-2:]
	                items[-1] = items[-1][:-len(right_tag)-2]
	                new_blocks.append(
	                    self.markdown.htmlStash.store(start))
	                new_blocks.extend(items)
	                if end.strip():
	                    new_blocks.append(
	                        self.markdown.htmlStash.store(end))
	            else:
	                new_blocks.append(
	                    self.markdown.htmlStash.store('\n\n'.join(items)))
	            new_blocks.append('\n')

	        new_text = "\n\n".join(new_blocks)
	        return new_text.split("\n")


	class ReferencePreprocessor(Preprocessor):
	    """
	    Remove reference definitions from text and store for later use.

	    A definition line has the form `[id]: link` with an optional title in
	    double quotes, single quotes or parentheses.  The title may also stand
	    alone on the line that follows the definition.
	    """

	    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
	    RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
	    TITLE_RE = re.compile(r'^%s$' % TITLE)

	    def run(self, lines):
	        """
	        Strip reference definitions out of `lines` and record them.

	        Each definition is saved in `self.markdown.references` under its
	        stripped, lower-cased id as a `(link, title)` tuple.  The link has
	        leading "<" and trailing ">" characters removed; the title is None
	        when no title was found.

	        Returns a new list holding the lines that are not part of a
	        definition.  `lines` is left empty on return.
	        """
	        new_text = []
	        total = len(lines)
	        pos = 0
	        # Walk by index rather than popping from the front on every step,
	        # which costs O(n) per pop.
	        while pos < total:
	            line = lines[pos]
	            pos += 1
	            m = self.RE.match(line)
	            if not m:
	                new_text.append(line)
	                continue
	            ref_id = m.group(1).strip().lower()
	            link = m.group(2).lstrip('<').rstrip('>')
	            # Groups 5-7 are the title text for the "..", '..' and (..)
	            # forms respectively.
	            title = m.group(5) or m.group(6) or m.group(7)
	            # Check next line for title -- but only if there is a next
	            # line, so a definition on the final line cannot raise
	            # IndexError.
	            if not title and pos < total:
	                tm = self.TITLE_RE.match(lines[pos])
	                if tm:
	                    pos += 1
	                    title = tm.group(2) or tm.group(3) or tm.group(4)
	            self.markdown.references[ref_id] = (link, title)

	        # Consume the input list, matching a pop-from-the-front loop, for
	        # callers that may rely on that effect.
	        del lines[:]
	        return new_text