| # markdown/searializers.py |
| # |
| # Add x/html serialization to Elementree |
| # Taken from ElementTree 1.3 preview with slight modifications |
| # |
| # Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved. |
| # |
| # fredrik@pythonware.com |
| # https://www.pythonware.com/ |
| # |
| # -------------------------------------------------------------------- |
| # The ElementTree toolkit is |
| # |
| # Copyright (c) 1999-2007 by Fredrik Lundh |
| # |
| # By obtaining, using, and/or copying this software and/or its |
| # associated documentation, you agree that you have read, understood, |
| # and will comply with the following terms and conditions: |
| # |
| # Permission to use, copy, modify, and distribute this software and |
| # its associated documentation for any purpose and without fee is |
| # hereby granted, provided that the above copyright notice appears in |
| # all copies, and that both that copyright notice and this permission |
| # notice appear in supporting documentation, and that the name of |
| # Secret Labs AB or the author not be used in advertising or publicity |
| # pertaining to distribution of the software without specific, written |
| # prior permission. |
| # |
| # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD |
| # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- |
| # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR |
| # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY |
| # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, |
| # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS |
| # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE |
| # OF THIS SOFTWARE. |
| # -------------------------------------------------------------------- |
| |
| |
| from xml.etree.ElementTree import ProcessingInstruction |
| from xml.etree.ElementTree import Comment, ElementTree, QName |
| import re |
| |
| __all__ = ['to_html_string', 'to_xhtml_string'] |
| |
| HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr", |
| "img", "input", "isindex", "link", "meta", "param") |
| RE_AMP = re.compile(r'&(?!(?:\#[0-9]+|\#x[0-9a-f]+|[0-9a-z]+);)', re.I) |
| |
| try: |
| HTML_EMPTY = set(HTML_EMPTY) |
| except NameError: # pragma: no cover |
| pass |
| |
| |
| def _raise_serialization_error(text): # pragma: no cover |
| raise TypeError( |
| "cannot serialize {!r} (type {})".format(text, type(text).__name__) |
| ) |
| |
| |
| def _escape_cdata(text): |
| # escape character data |
| try: |
| # it's worth avoiding do-nothing calls for strings that are |
| # shorter than 500 character, or so. assume that's, by far, |
| # the most common case in most applications. |
| if "&" in text: |
| # Only replace & when not part of an entity |
| text = RE_AMP.sub('&', text) |
| if "<" in text: |
| text = text.replace("<", "<") |
| if ">" in text: |
| text = text.replace(">", ">") |
| return text |
| except (TypeError, AttributeError): # pragma: no cover |
| _raise_serialization_error(text) |
| |
| |
| def _escape_attrib(text): |
| # escape attribute value |
| try: |
| if "&" in text: |
| # Only replace & when not part of an entity |
| text = RE_AMP.sub('&', text) |
| if "<" in text: |
| text = text.replace("<", "<") |
| if ">" in text: |
| text = text.replace(">", ">") |
| if "\"" in text: |
| text = text.replace("\"", """) |
| if "\n" in text: |
| text = text.replace("\n", " ") |
| return text |
| except (TypeError, AttributeError): # pragma: no cover |
| _raise_serialization_error(text) |
| |
| |
| def _escape_attrib_html(text): |
| # escape attribute value |
| try: |
| if "&" in text: |
| # Only replace & when not part of an entity |
| text = RE_AMP.sub('&', text) |
| if "<" in text: |
| text = text.replace("<", "<") |
| if ">" in text: |
| text = text.replace(">", ">") |
| if "\"" in text: |
| text = text.replace("\"", """) |
| return text |
| except (TypeError, AttributeError): # pragma: no cover |
| _raise_serialization_error(text) |
| |
| |
| def _serialize_html(write, elem, format): |
| tag = elem.tag |
| text = elem.text |
| if tag is Comment: |
| write("<!--%s-->" % _escape_cdata(text)) |
| elif tag is ProcessingInstruction: |
| write("<?%s?>" % _escape_cdata(text)) |
| elif tag is None: |
| if text: |
| write(_escape_cdata(text)) |
| for e in elem: |
| _serialize_html(write, e, format) |
| else: |
| namespace_uri = None |
| if isinstance(tag, QName): |
| # QNAME objects store their data as a string: `{uri}tag` |
| if tag.text[:1] == "{": |
| namespace_uri, tag = tag.text[1:].split("}", 1) |
| else: |
| raise ValueError('QName objects must define a tag.') |
| write("<" + tag) |
| items = elem.items() |
| if items: |
| items = sorted(items) # lexical order |
| for k, v in items: |
| if isinstance(k, QName): |
| # Assume a text only QName |
| k = k.text |
| if isinstance(v, QName): |
| # Assume a text only QName |
| v = v.text |
| else: |
| v = _escape_attrib_html(v) |
| if k == v and format == 'html': |
| # handle boolean attributes |
| write(" %s" % v) |
| else: |
| write(' {}="{}"'.format(k, v)) |
| if namespace_uri: |
| write(' xmlns="%s"' % (_escape_attrib(namespace_uri))) |
| if format == "xhtml" and tag.lower() in HTML_EMPTY: |
| write(" />") |
| else: |
| write(">") |
| if text: |
| if tag.lower() in ["script", "style"]: |
| write(text) |
| else: |
| write(_escape_cdata(text)) |
| for e in elem: |
| _serialize_html(write, e, format) |
| if tag.lower() not in HTML_EMPTY: |
| write("</" + tag + ">") |
| if elem.tail: |
| write(_escape_cdata(elem.tail)) |
| |
| |
| def _write_html(root, format="html"): |
| assert root is not None |
| data = [] |
| write = data.append |
| _serialize_html(write, root, format) |
| return "".join(data) |
| |
| |
| # -------------------------------------------------------------------- |
| # public functions |
| |
| def to_html_string(element): |
| return _write_html(ElementTree(element).getroot(), format="html") |
| |
| |
| def to_xhtml_string(element): |
| return _write_html(ElementTree(element).getroot(), format="xhtml") |