| """ |
| PRE-PROCESSORS |
| ============================================================================= |
| |
| Preprocessors work on source text before we start doing anything too |
| complicated. |
| """ |
| |
| from __future__ import absolute_import |
| from __future__ import unicode_literals |
| from . import util |
| from . import odict |
| import re |
| |
| |
def build_preprocessors(md_instance, **kwargs):
    """
    Assemble the default, ordered collection of preprocessors.

    The result is an ``odict.OrderedDict`` keyed by preprocessor name. The
    ``html_block`` entry is left out when ``md_instance.safeMode`` equals
    ``'escape'``. Extra keyword arguments are accepted but not used here.

    """
    registry = odict.OrderedDict()

    def register(name, processor_class):
        # Every default preprocessor is constructed from the same instance.
        registry[name] = processor_class(md_instance)

    register('normalize_whitespace', NormalizeWhitespace)
    if md_instance.safeMode != 'escape':
        register("html_block", HtmlBlockPreprocessor)
    register("reference", ReferencePreprocessor)
    return registry
| |
| |
class Preprocessor(util.Processor):
    """
    Base class for processors that operate on the source as a list of lines.

    Preprocessors run once the text has been broken into lines. A concrete
    preprocessor provides a "run" method which receives the list of lines,
    changes it as needed, and hands back either that same list or a new one.

    Every preprocessor must derive from markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Process the document, given as a list of strings split on newlines.

        Subclasses are expected to override this method and return the
        (possibly modified) list of lines. The base implementation does
        nothing and therefore returns None.

        """
        pass  # pragma: no cover
| |
| |
class NormalizeWhitespace(Preprocessor):
    """ Normalize whitespace for consistent parsing. """

    def run(self, lines):
        """
        Return the document lines with whitespace normalized.

        The lines are joined, the ``util.STX`` / ``util.ETX`` marker
        characters are removed, all line endings become ``\\n``, two newlines
        are appended, tabs are expanded to ``self.markdown.tab_length``
        columns and lines made up only of spaces are emptied. The text is then
        split back into a list of lines.

        """
        document = '\n'.join(lines)
        # Strip the reserved placeholder delimiters from the input.
        for marker in (util.STX, util.ETX):
            document = document.replace(marker, "")
        # "\r\n" has to be handled before the bare "\r".
        for line_ending in ("\r\n", "\r"):
            document = document.replace(line_ending, "\n")
        document = document + "\n\n"
        document = document.expandtabs(self.markdown.tab_length)
        # A run of spaces directly after a newline and before the next
        # newline is a whitespace-only line: drop the spaces.
        cleaned = re.sub(r'(?<=\n) +\n', '\n', document)
        return cleaned.split('\n')
| |
| |
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval.

    The document is cut into blocks at blank lines. Blocks that begin with a
    block-level tag, a comment or a ``<?``/``<@``/``<%``/``<!`` construct are
    handed to ``self.markdown.htmlStash`` and replaced by the placeholder it
    returns; everything else is passed through untouched.

    When ``markdown_in_raw`` is true and the opening tag carries a
    ``markdown`` attribute, the tag is registered with
    ``htmlStash.store_tag`` and its content is left in the document instead
    of being stashed verbatim.
    """

    # Templates for the closing delimiter, tried in this order by
    # ``_get_right_tag`` ("</tag>" first, then "tag>").
    right_tag_patterns = ["</%s>", "%s>"]
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
                       attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # Class-level switch; nothing in this class sets it to True.
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """Parse the opening tag at the start of ``block``.

        Returns a ``(tag, length, attrs)`` tuple where ``length`` is the
        number of characters taken up by the opening tag and ``attrs`` maps
        attribute names to their values ("" for valueless attributes).

        If ``left_tag_re`` does not match, the tag is whatever sits between
        the first character and the first ">" (lower-cased), the length is
        ``len(tag) + 2`` and no attributes are reported.
        """
        m = self.left_tag_re.match(block)
        if not m:
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag) + 2, {}

        attrs = {}
        raw_attrs = m.group('attrs')
        if raw_attrs:
            for ma in self.attrs_re.finditer(raw_attrs):
                if ma.group('attr'):
                    # attr="value" (an empty quoted value is stored as "")
                    attrs[ma.group('attr').strip()] = ma.group('value') or ""
                elif ma.group('attr1'):
                    # attr=value
                    attrs[ma.group('attr1').strip()] = \
                        ma.group('value1') or ""
                elif ma.group('attr2'):
                    # bare attr
                    attrs[ma.group('attr2').strip()] = ""
        return m.group('tag'), len(m.group(0)), attrs

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """Find the end of the ``rtag`` that balances an already opened tag.

        Searching starts at ``start_index``. Each further ``ltag`` met before
        the next ``rtag`` is matched with its own ``rtag`` first (by
        recursion). Returns the index just past the balancing ``rtag``, or
        -1 when there is none.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if (j > i or j == -1):
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """Locate the closing tag belonging to ``left_tag`` in ``block``.

        Each template in ``right_tag_patterns`` is tried, searching from
        ``left_index``. On success returns ``(tag, end)`` where ``tag`` is
        the closing tag with leading "<" and trailing ">" characters removed
        and ``end`` is the index just past it. A hit only counts when
        ``end`` is greater than 2.

        When nothing is found, returns the lower-cased tail of the stripped
        block (``[-left_index:-1]``) together with ``len(block)``.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind(
                "<%s" % left_tag, tag, left_index, block
            )
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """Return True when ``right_tag`` closes ``left_tag``.

        Tags starting with "?", "@" or "%" are always treated as closed, a
        comment ("--") is closed by "--", and any other tag is closed by
        "/" followed by the same name. Empty strings are handled without
        raising.
        """
        if left_tag.startswith(('?', '@', '%')):  # handle PHP, etc.
            return True
        if right_tag == "/" + left_tag:
            return True
        return left_tag == "--" and right_tag == "--"

    def _is_oneliner(self, tag):
        """Return True for tags that never have a closing tag (``hr``)."""
        return tag in ('hr', 'hr/')

    def _stringindex_to_listindex(self, stringindex, items):
        """
        Same effect as concatenating the strings in items,
        finding the character to which stringindex refers in that string,
        and returning the index of the item in which that character resides.

        ``items`` is not modified.
        """
        # The sentinel lets an index equal to the total length resolve to
        # ``len(items)`` instead of running off the end. It is added to a
        # copy so the caller's list is left alone.
        padded = list(items) + ['dummy']
        i, count = 0, 0
        while count <= stringindex:
            count += len(padded[i])
            i += 1
        return i - 1

    def _nested_markdown_in_html(self, items):
        """Find and process html child elements of the given element block.

        ``items`` is edited in place and also returned. Children with a
        ``markdown`` attribute are registered with ``htmlStash.store_tag``
        and have their opening/closing tags cut away; other children are
        stashed as raw html and replaced by a single placeholder.
        """
        # NOTE: ``items`` is deliberately modified while being enumerated;
        # later iterations see the list as already edited.
        for i, item in enumerate(items):
            if self.left_tag_re.match(item):
                remainder = items[i:]
                joined = ''.join(remainder)
                left_tag, left_index, attrs = self._get_left_tag(joined)
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, joined)
                right_listindex = \
                    self._stringindex_to_listindex(data_index, remainder) + i
                if 'markdown' in attrs:
                    items[i] = items[i][left_index:]  # remove opening tag
                    placeholder = self.markdown.htmlStash.store_tag(
                        left_tag, attrs, i + 1, right_listindex + 1)
                    items.insert(i, placeholder)
                    if len(items) - right_listindex <= 1:  # last nest, no tail
                        right_listindex -= 1
                    items[right_listindex] = items[right_listindex][
                        :-len(right_tag) - 2]  # remove closing tag
                else:  # raw html
                    if len(items) - right_listindex <= 1:  # last element
                        right_listindex -= 1
                    offset = 1 if i == right_listindex else 0
                    placeholder = self.markdown.htmlStash.store('\n\n'.join(
                        items[i:right_listindex + offset]))
                    del items[i:right_listindex + offset]
                    items.insert(i, placeholder)
        return items

    def _stash_items(self, items, left_tag, left_index, right_tag, attrs):
        """Stash the collected blocks of one element; return blocks to emit.

        Without markdown-in-raw handling, the blocks are joined with blank
        lines, stored as raw html and the single placeholder is returned.
        Otherwise the outer tag is registered with ``store_tag``, its opening
        and closing tags are cut from ``items`` (modified in place) and the
        nested children are processed.
        """
        stash = self.markdown.htmlStash
        if not (self.markdown_in_raw and 'markdown' in attrs):
            return [stash.store('\n\n'.join(items))]

        items[0] = items[0][left_index:]
        items[-1] = items[-1][:-len(right_tag) - 2]
        # NOTE(review): the +3/+2 and "nests - 2" offsets are kept exactly
        # as they were; presumably they account for the placeholders that
        # end up around the content - confirm against the code that consumes
        # ``tag_data``.
        if items[-1]:  # not a newline/empty string
            right_index = len(items) + 3
        else:
            right_index = len(items) + 2
        blocks = [stash.store_tag(left_tag, attrs, 0, right_index)]
        placeholderslen = len(stash.tag_data)
        blocks.extend(self._nested_markdown_in_html(items))
        nests = len(stash.tag_data) - placeholderslen
        stash.tag_data[-1 - nests]['right_index'] += nests - 2
        return blocks

    def run(self, lines):
        """Replace raw html blocks in ``lines`` with stash placeholders.

        Takes and returns the document as a list of lines.
        """
        new_blocks = []
        # rsplit (not split) is kept on purpose: for runs of three or more
        # newlines the two cut at different positions.
        # The blocks are held as a reversed stack so that taking the next
        # block and pushing back a remainder are both O(1).
        pending = "\n".join(lines).rsplit("\n\n")
        pending.reverse()
        items = []
        left_tag = ''
        right_tag = ''
        left_index = 0
        attrs = {}
        in_tag = False  # inside an element spanning several blocks

        while pending:
            block = pending.pop()
            # Drop up to two leading newlines left over from the split.
            if block.startswith("\n"):
                block = block[1:]
            if block.startswith("\n"):
                block = block[1:]

            if in_tag:
                items.append(block)

                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        pending.append(block[data_index:])

                    in_tag = False
                    new_blocks.extend(self._stash_items(
                        items, left_tag, left_index, right_tag, attrs))
                    items = []
                continue

            if not (block.startswith("<") and len(block.strip()) > 1):
                new_blocks.append(block)
                continue

            if block[1:4] == "!--":
                # is a comment block
                left_tag, left_index, attrs = "--", 2, {}
            else:
                left_tag, left_index, attrs = self._get_left_tag(block)
            right_tag, data_index = self._get_right_tag(left_tag,
                                                        left_index,
                                                        block)
            # keep checking conditions below and maybe just append

            if data_index < len(block) and (
                    util.isBlockLevel(left_tag) or left_tag == '--'):
                # Text follows the closing tag: queue it as the next block.
                pending.append(block[data_index:])
                block = block[:data_index]

            if not (util.isBlockLevel(left_tag) or
                    block[1] in ["!", "?", "@", "%"]):
                new_blocks.append(block)
                continue

            if self._is_oneliner(left_tag):
                new_blocks.append(block.strip())
                continue

            if block.rstrip().endswith(">") \
                    and self._equal_tags(left_tag, right_tag):
                # Complete element within a single block.
                if self.markdown_in_raw and 'markdown' in attrs:
                    block = block[left_index:-len(right_tag) - 2]
                    new_blocks.append(
                        self.markdown.htmlStash.store_tag(
                            left_tag, attrs, 0, 2))
                    new_blocks.append(block)
                else:
                    new_blocks.append(
                        self.markdown.htmlStash.store(block.strip()))
            elif (not self._equal_tags(left_tag, right_tag)) and \
                    (util.isBlockLevel(left_tag) or left_tag == "--"):
                # if is block level tag and is not complete
                items.append(block.strip())
                in_tag = True
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store(block.strip()))

        if items:
            # The document ended inside an unclosed element.
            new_blocks.extend(self._stash_items(
                items, left_tag, left_index, right_tag, attrs))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
| |
| |
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # Optional title: "...", '...' or (...), with optional spaces around it.
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # [id]: link "optional title", indented by at most three spaces.
    RE = re.compile(
        r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
    )
    # A line that consists of nothing but a title.
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """
        Strip reference definitions out of ``lines``.

        Every line matching ``RE`` is dropped from the output and recorded in
        ``self.markdown.references`` as ``id -> (link, title)``. The id is
        stripped and lower-cased, and leading "<" / trailing ">" characters
        are removed from the link. If the definition line has no title, the
        line after it is checked against ``TITLE_RE`` and, when it matches,
        used as the title and dropped as well. The title may be None.

        Returns a new list with the remaining lines. The input list is only
        read, never modified.

        """
        new_text = []
        index = 0
        total = len(lines)
        while index < total:
            line = lines[index]
            index += 1
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            t = m.group(5) or m.group(6) or m.group(7)
            # Check next line for title - but only if there is a next line;
            # a definition on the very last line has nothing after it.
            if not t and index < total:
                tm = self.TITLE_RE.match(lines[index])
                if tm:
                    index += 1
                    t = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, t)

        return new_text