| #!/usr/bin/python | |
| # CSS Test Source Manipulation Library | |
| # Initial code by fantasai, joint copyright 2010 W3C and Microsoft | |
| # additions by peter.linss@hp.com copyright 2013 Hewlett-Packard | |
| # Licensed under BSD 3-Clause: <http://www.w3.org/Consortium/Legal/2008/03-bsd-license> | |
import copy

try:
    import htmlentitydefs
except ImportError:  # Python 3 renamed the module to html.entities
    import html.entities as htmlentitydefs

import lxml
from lxml import etree
class HTMLSerializer(object):
    """Serialize an lxml tree to HTML or XHTML source text.

    The public entry points are serializeHTML() and serializeXHTML(); both
    return the serialized document as a text string.  A serializer instance
    keeps its output buffer in ``mOutput`` and its current mode in
    ``mXHTML``; both are reset at the start of every serialize call.

    NOTE(review): no ``xmlns`` declarations are generated by this class; they
    are only written if they show up as ordinary attributes of an element.
    lxml normally keeps declarations in ``nsmap`` rather than ``attrib`` --
    confirm against the tree builders used by callers.
    """

    gXMLns = 'http://www.w3.org/XML/1998/namespace'
    gXMLNSns = 'http://www.w3.org/2000/xmlns/'
    gHTMLns = 'http://www.w3.org/1999/xhtml'

    # Namespace URI -> prefix, used when no prefix map was inherited from a
    # parent element.  The XML namespace is bound to the prefix 'xml'.
    gDefaultNamespaces = {'http://www.w3.org/XML/1998/namespace': 'xml',
                          'http://www.w3.org/2000/xmlns/': 'xmlns',
                          'http://www.w3.org/1999/xlink': 'xlink'}

    # HTML elements that are written without content or closing tag.
    gVoidElements = frozenset((
        'base',
        'command',
        'event-source',
        'link',
        'meta',
        'hr',
        'br',
        'img',
        'embed',
        'param',
        'area',
        'col',
        'input',
        'source'
    ))

    # HTML elements whose text is written unescaped in HTML mode.
    gCDataElements = frozenset((
        'style',
        'script'
    ))

    # Code points that are always written as character references.
    gInvisibleChars = frozenset(
        # ASCII control chars (tab, LF and CR are deliberately left out)
        list(range(0x0, 0x9)) + list(range(0xB, 0xD)) + list(range(0xE, 0x20)) +
        # Other control chars
        # fixed-width spaces, zero-width marks, bidi marks
        list(range(0x2000, 0x2010)) +
        # LS, PS, bidi control codes
        list(range(0x2028, 0x2030)) +
        # nbsp, mathsp, ideosp, WJ, interlinear
        [0x00A0, 0x205F, 0x3000, 0x2060, 0xFFF9, 0xFFFA, 0xFFFB]
    )

    # Code points escaped in text content: the invisible ones plus & < >.
    gXMLEscapes = frozenset(gInvisibleChars |
                            frozenset((ord('&'), ord('<'), ord('>'))))

    # The only named entities that may be used in XHTML output.
    gXMLEntityNames = {'"': 'quot', '&': 'amp', "'": 'apos', '<': 'lt', '>': 'gt'}

    gDocTypes = {
        'html':
            '<!DOCTYPE html>',
        'html4':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">',
        'html4-transitional':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">',
        'html4-frameset':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/html4/frameset.dtd">',
        'svg11':
            '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1 Basic//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd">',
        'svg11-tiny':
            '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1 Tiny//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-tiny.dtd">',
        'xhtml10':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">',
        'xhtml10-transitional':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
        'xhtml10-frameset':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">',
        'xhtml11':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">',
        'xhtml-basic11':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML Basic 1.1//EN" "http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd">'
    }

    # Doctype substitutions applied when the document's own doctype belongs
    # to the other syntax than the one being written.
    gHTMLToXHTMLDocTypes = {
        'html': 'xhtml10',
        'html4': 'xhtml10',
        'html4-transitional': 'xhtml10-transitional',
        'html4-frameset': 'xhtml10-frameset',
    }
    gXHTMLToHTMLDocTypes = {
        'xhtml10': 'html4',
        'xhtml10-transitional': 'html4-transitional',
        'xhtml10-frameset': 'html4-frameset',
        'xhtml11': 'html4',
    }

    # Text type used to stringify output fragments on Python 2 and 3.
    try:
        gTextType = unicode
    except NameError:
        gTextType = str

    def __init__(self):
        self._reset()

    def _reset(self, xhtml = False):
        """Clear the output buffer and select HTML (default) or XHTML mode."""
        self.mOutput = u''
        self.mXHTML = xhtml

    def _output(self, *args):
        """Append the text form of every argument to the output buffer."""
        toText = self.gTextType
        self.mOutput += u''.join([toText(arg) for arg in args])

    def _escape(self, text, escapeChars):
        """Return text with every character whose code point is in
        escapeChars replaced by a character reference.

        XHTML mode uses the five predefined XML entity names, HTML mode the
        HTML entity names; characters without a name get a hexadecimal
        numeric reference.  The text is returned untouched when nothing needs
        escaping (the common case).
        """
        escapable = set(char for char in text if ord(char) in escapeChars)
        if not escapable:
            return text

        replacements = {}
        for char in escapable:
            if self.mXHTML:
                name = self.gXMLEntityNames.get(char)
            else:
                name = htmlentitydefs.codepoint2name.get(ord(char))
            replacements[char] = u'&%s;' % name if name else u'&#x%X;' % ord(char)

        # Substitute in a single pass over the text: chained str.replace()
        # calls would re-escape the '&' of a reference produced earlier.
        return u''.join([replacements.get(char, char) for char in text])

    def _escapeXML(self, text):
        """Escape &, <, > and invisible characters in text content."""
        return self._escape(text, self.gXMLEscapes)

    def _escapeInvisible(self, text):
        """Escape only the invisible characters in text."""
        return self._escape(text, self.gInvisibleChars)

    def _quoteAttribute(self, value):
        """Return an attribute value escaped and wrapped in quotes.

        '&' is always escaped and '<' additionally in XHTML mode.  Single
        quotes are used when the value contains double quotes but no single
        quotes; otherwise double quotes are used and embedded double quotes
        are escaped.
        """
        value = value.replace('&', '&amp;')
        if self.mXHTML:
            value = value.replace('<', '&lt;')
        if ('"' in value) and ("'" not in value):
            quote = "'"
        else:
            quote = '"'
            value = value.replace('"', '&quot;')
        return quote + self._escapeInvisible(value) + quote

    def _serializeElement(self, element, namespacePrefixes):
        """Write an element, its attributes, content, children and tail.

        namespacePrefixes maps namespace URIs to the prefixes in scope; when
        it is empty or None the class defaults are used.
        """
        qName = etree.QName(element)
        attrs = element.attrib.items()  # in tree order
        isHTML = (qName.namespace == self.gHTMLns)

        if not namespacePrefixes:
            namespacePrefixes = self.gDefaultNamespaces

        if self.mXHTML:
            # Work on a copy so declarations found on this element only
            # apply to it and its descendants.
            namespacePrefixes = copy.copy(namespacePrefixes)
            for attr, value in attrs:
                attrQName = etree.QName(attr)
                if 'xmlns' == attrQName.localname:
                    namespacePrefixes[value] = ''
                elif self.gXMLNSns == attrQName.namespace:
                    namespacePrefixes[value] = attrQName.localname

        # The same (possibly prefixed) name is used for the opening and the
        # closing tag.  Namespaces without a known prefix are written bare.
        tagName = qName.localname
        if self.mXHTML and qName.namespace:
            prefix = namespacePrefixes.get(qName.namespace)
            if prefix:
                tagName = prefix + ':' + qName.localname
        self._output('<', tagName)

        for attr, value in attrs:
            attrQName = etree.QName(attr)
            if (attrQName.namespace == self.gXMLns) and ('lang' == attrQName.localname):
                attr = 'xml:lang' if self.mXHTML else 'lang'
            else:
                prefix = None
                if attrQName.namespace:
                    prefix = namespacePrefixes.get(attrQName.namespace)
                if prefix:
                    attr = prefix + ':' + attrQName.localname
                else:
                    attr = attrQName.localname
            self._output(' ', attr, '=', self._quoteAttribute(value))

        if isHTML and (qName.localname in self.gVoidElements):
            # Void elements have neither content nor a closing tag.
            if self.mXHTML:
                self._output(' />')
            else:
                self._output('>')
        else:
            self._output('>')
            if element.text is not None:
                if isHTML and (qName.localname in self.gCDataElements):
                    if self.mXHTML:
                        self._output(self._escapeXML(element.text))  # or self._output('<![CDATA[', element.text, ']]>')
                    else:
                        # style/script content is raw text in HTML
                        self._output(element.text)
                else:
                    self._output(self._escapeXML(element.text))
            for child in element:
                self._serializeNode(child, namespacePrefixes)
            self._output('</', tagName, '>')

        if element.tail is not None:
            self._output(self._escapeXML(element.tail))

    def _serializeEntity(self, entity):
        """Write an entity reference node verbatim, followed by its tail."""
        self._output(entity.text)
        if entity.tail is not None:
            self._output(self._escapeXML(entity.tail))

    def _serializePI(self, pi):
        """Write a processing instruction and its tail (XHTML mode only).

        Raises Exception in HTML mode.
        """
        if not self.mXHTML:
            raise Exception("Processing Instructions can't be converted to HTML")
        # A missing body must not be written as the literal text 'None'.
        text = pi.text if pi.text is not None else u''
        self._output('<?', pi.target, ' ', text, '?>')
        if pi.tail is not None:
            self._output(self._escapeXML(pi.tail))

    def _serializeComment(self, comment):
        """Write a comment node and its tail."""
        text = comment.text if comment.text is not None else u''
        self._output('<!--', text, '-->')  # XXX escape comment?
        if comment.tail is not None:
            self._output(self._escapeXML(comment.tail))

    def _serializeNode(self, node, namespacePrefixes = None):
        """Dispatch a node to the serializer matching its node type."""
        if isinstance(node, etree._Entity):
            self._serializeEntity(node)
        elif isinstance(node, etree._ProcessingInstruction):
            self._serializePI(node)
        elif isinstance(node, etree._Comment):
            self._serializeComment(node)
        else:
            self._serializeElement(node, namespacePrefixes)

    def _serializeTree(self, tree):
        """Write the root element together with its top-level siblings."""
        root = tree.getroot()
        # The preceding siblings are collected and then written in reverse
        # of the order in which itersiblings() yields them.
        preceding = list(root.itersiblings(preceding = True))
        for node in reversed(preceding):
            self._serializeNode(node)
        self._serializeNode(root)
        for node in root.itersiblings():
            self._serializeNode(node)

    def _serializeDoctype(self, tree, doctype, default):
        """Write the doctype declaration followed by a newline.

        An explicit doctype (a key of gDocTypes) always wins.  Otherwise the
        document's own doctype is used: if it matches a known doctype it is
        swapped for its counterpart in the syntax being written, if it is
        unknown it is copied through unchanged.  Documents without a doctype
        get gDocTypes[default].

        Raises KeyError when doctype or default is not a key of gDocTypes.
        """
        if doctype:
            self._output(self.gDocTypes[doctype], '\n')
            return

        docinfo = getattr(tree, 'docinfo', None)
        if not (docinfo and docinfo.doctype):
            self._output(self.gDocTypes[default], '\n')
            return

        # Identify the document's doctype by case-insensitive comparison.
        doctypeSearch = docinfo.doctype.lower()
        matched = None
        for name, declaration in self.gDocTypes.items():
            if declaration.lower() == doctypeSearch:
                matched = name
                break

        if self.mXHTML:
            conversions = self.gHTMLToXHTMLDocTypes
        else:
            conversions = self.gXHTMLToHTMLDocTypes
        matched = conversions.get(matched, matched)

        if matched:
            self._output(self.gDocTypes[matched], '\n')
        else:
            self._output(docinfo.doctype, '\n')

    def serializeHTML(self, tree, doctype = None):
        """Return tree serialized as HTML text.

        doctype optionally names a gDocTypes entry to force; the fallback for
        documents without a doctype is 'html'.
        """
        self._reset()
        self._serializeDoctype(tree, doctype, 'html')
        self._serializeTree(tree)
        return self.mOutput

    def serializeXHTML(self, tree, doctype = None):
        """Return tree serialized as XHTML text.

        doctype optionally names a gDocTypes entry to force; the fallback for
        documents without a doctype is 'xhtml11'.
        """
        self._reset(True)
        # XXX '<!xml ...' ??
        self._serializeDoctype(tree, doctype, 'xhtml11')
        self._serializeTree(tree)
        return self.mOutput