third_party/blink/web_tests/external/wpt/tools/third_party/html5lib/html5lib/tests/test_serializer.py - chromium/src - Git at Google

 from __future__ import absolute_import, division, unicode_literals

 import os
 import json

 import pytest

 from .support import get_data_files

 from html5lib import constants
 from html5lib.filters.lint import Filter as Lint
 from html5lib.serializer import HTMLSerializer, serialize
 from html5lib.treewalkers.base import TreeWalker

 # pylint:disable=wrong-import-position
 optionals_loaded = []

 try:
     from lxml import etree
     optionals_loaded.append("lxml")
 except ImportError:
     pass
 # pylint:enable=wrong-import-position

 default_namespace = constants.namespaces["html"]


 class JsonWalker(TreeWalker):
     """Tree walker that replays the JSON token lists of the serializer tests.

     ``self.tree`` is iterated as a sequence of lists; the first item of each
     list is the token type and the remaining items depend on that type.
     """

     def __iter__(self):
         """Yield the tree-walker tokens corresponding to each JSON token."""
         for entry in self.tree:
             kind = entry[0]

             if kind == "StartTag":
                 # A four-item token carries its own namespace; otherwise
                 # the HTML namespace is assumed.
                 if len(entry) == 4:
                     namespace, name, attrib = entry[1:4]
                 else:
                     namespace = default_namespace
                     name, attrib = entry[1:3]
                 yield self.startTag(namespace, name, self._convertAttrib(attrib))

             elif kind == "EndTag":
                 if len(entry) == 3:
                     namespace, name = entry[1:3]
                 else:
                     namespace = default_namespace
                     name = entry[1]
                 yield self.endTag(namespace, name)

             elif kind == "EmptyTag":
                 if len(entry) == 4:
                     namespace, name, attrib = entry[1:]
                 else:
                     namespace = default_namespace
                     name, attrib = entry[1:]
                 # emptyTag() is iterated, so forward everything it produces.
                 for produced in self.emptyTag(namespace, name, self._convertAttrib(attrib)):
                     yield produced

             elif kind == "Comment":
                 yield self.comment(entry[1])

             elif kind in ("Characters", "SpaceCharacters"):
                 for produced in self.text(entry[1]):
                     yield produced

             elif kind == "Doctype":
                 # Three- and four-item tokens pass their extra items on as
                 # positional arguments; any other length passes the name only.
                 if len(entry) in (3, 4):
                     doctype_args = entry[1:4]
                 else:
                     doctype_args = (entry[1],)
                 yield self.doctype(*doctype_args)

             else:
                 raise ValueError("Unknown token type: " + kind)

     def _convertAttrib(self, attribs):
         """Turn the JSON attribute list into a ``{(namespace, name): value}`` dict.

         JSON has no way to express tuple keys, so the test data stores each
         attribute as a dict with "namespace", "name" and "value" keys.
         Duplicate (namespace, name) pairs trip an assertion.
         """
         converted = {}
         for item in attribs:
             key = (item["namespace"], item["name"])
             assert key not in converted
             converted[key] = item["value"]
         return converted


 def serialize_html(input, options):
     """Serialize the JSON token list ``input`` with ``HTMLSerializer``.

     ``options`` maps serializer option names to values.  An "encoding"
     entry, if present, is removed from the keyword arguments and handed to
     ``render`` instead.  The caller's dict is copied, not modified.
     """
     # str() keys so the mapping can be expanded as keyword arguments.
     kwargs = {str(key): value for key, value in options.items()}
     encoding = kwargs.pop("encoding", None)
     stream = Lint(JsonWalker(input), False)
     serializer = HTMLSerializer(alphabetical_attributes=True, **kwargs)
     return serializer.render(stream, encoding)


 def runSerializerTest(input, expected, options):
     """Serialize ``input`` and assert the result is one of ``expected``.

     When ``options`` holds a truthy "encoding", each expected string is
     encoded with it before the comparison.
     """
     encoding = options.get("encoding", None)
     if encoding:
         expected = [candidate.encode(encoding) for candidate in expected]

     result = serialize_html(input, options)

     if len(expected) == 1:
         only = expected[0]
         assert only == result, "Expected:\n%s\nActual:\n%s\nOptions:\n%s" % (only, result, str(options))
     elif result not in expected:
         message = "Expected: %s, Received: %s" % (expected, result)
         assert False, message


 def throwsWithLatin1(input):
     """Assert that serializing ``input`` as ISO-8859-1 raises UnicodeEncodeError."""
     latin1_options = {"encoding": "iso-8859-1"}
     with pytest.raises(UnicodeEncodeError):
         serialize_html(input, latin1_options)


 def testDoctypeName():
     """A doctype name outside ISO-8859-1 must make serialization fail."""
     doctype = ["Doctype", "\u0101"]
     throwsWithLatin1([doctype])


 def testDoctypePublicId():
     """A doctype whose second field is outside ISO-8859-1 must fail to serialize."""
     doctype = ["Doctype", "potato", "\u0101"]
     throwsWithLatin1([doctype])


 def testDoctypeSystemId():
     """A doctype whose third field is outside ISO-8859-1 must fail to serialize."""
     doctype = ["Doctype", "potato", "potato", "\u0101"]
     throwsWithLatin1([doctype])


 def testCdataCharacters():
     """U+0101 after a <style> start tag is expected as &amacr; under ISO-8859-1."""
     tokens = [
         ["StartTag", "http://www.w3.org/1999/xhtml", "style", {}],
         ["Characters", "\u0101"],
     ]
     runSerializerTest(tokens, ["<style>&amacr;"], {"encoding": "iso-8859-1"})


 def testCharacters():
     """U+0101 in character data is expected as &amacr; under ISO-8859-1."""
     tokens = [["Characters", "\u0101"]]
     runSerializerTest(tokens, ["&amacr;"], {"encoding": "iso-8859-1"})


 def testStartTagName():
     """A start-tag name outside ISO-8859-1 must make serialization fail."""
     start_tag = ["StartTag", "http://www.w3.org/1999/xhtml", "\u0101", []]
     throwsWithLatin1([start_tag])


 def testAttributeName():
     """An attribute name outside ISO-8859-1 must make serialization fail."""
     attribute = {"namespace": None, "name": "\u0101", "value": "potato"}
     start_tag = ["StartTag", "http://www.w3.org/1999/xhtml", "span", [attribute]]
     throwsWithLatin1([start_tag])


 def testAttributeValue():
     """U+0101 in an attribute value is expected as &amacr; under ISO-8859-1."""
     attribute = {"namespace": None, "name": "potato", "value": "\u0101"}
     start_tag = ["StartTag", "http://www.w3.org/1999/xhtml", "span", [attribute]]
     runSerializerTest([start_tag], ["<span potato=&amacr;>"], {"encoding": "iso-8859-1"})


 def testEndTagName():
     """An end-tag name outside ISO-8859-1 must make serialization fail."""
     end_tag = ["EndTag", "http://www.w3.org/1999/xhtml", "\u0101"]
     throwsWithLatin1([end_tag])


 def testComment():
     """Comment text outside ISO-8859-1 must make serialization fail."""
     comment = ["Comment", "\u0101"]
     throwsWithLatin1([comment])


 def testThrowsUnknownOption():
     """Constructing ``HTMLSerializer`` with an unknown keyword must raise TypeError."""
     with pytest.raises(TypeError):
         HTMLSerializer(foobar=None)


 @pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"))
 def testSpecQuoteAttribute(c):
     """With quote_attr_values="spec", the value ``c`` is expected to be quoted."""
     attribute = {"namespace": None, "name": "foo", "value": c}
     input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span", [attribute]]]
     # Expect double quotes, except when the value is itself a double quote.
     template = "<span foo='%s'>" if c == '"' else '<span foo="%s">'
     output_ = [template % c]
     runSerializerTest(input_, output_, {"quote_attr_values": "spec"})


 @pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"
                                    "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
                                    "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
                                    "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
                                    "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
                                    "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                                    "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
                                    "\u3000"))
 def testLegacyQuoteAttribute(c):
     """With quote_attr_values="legacy", the value ``c`` is expected to be quoted."""
     attribute = {"namespace": None, "name": "foo", "value": c}
     input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span", [attribute]]]
     # Expect double quotes, except when the value is itself a double quote.
     template = "<span foo='%s'>" if c == '"' else '<span foo="%s">'
     output_ = [template % c]
     runSerializerTest(input_, output_, {"quote_attr_values": "legacy"})


 @pytest.fixture
 def lxml_parser():
     """Provide an lxml ``XMLParser`` created with ``resolve_entities=False``."""
     parser = etree.XMLParser(resolve_entities=False)
     return parser


 @pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
 def testEntityReplacement(lxml_parser):
     """With default entity handling, &beta; is expected to serialize as U+03B2."""
     source = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
     root = etree.fromstring(source, parser=lxml_parser)
     tree = root.getroottree()
     serialized = serialize(tree, tree="lxml", omit_optional_tags=False)
     assert serialized == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>'


 @pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
 def testEntityXML(lxml_parser):
     """The &gt; entity is expected to come back out of the serializer as &gt;."""
     source = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'
     root = etree.fromstring(source, parser=lxml_parser)
     tree = root.getroottree()
     serialized = serialize(tree, tree="lxml", omit_optional_tags=False)
     assert serialized == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'


 @pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
 def testEntityNoResolve(lxml_parser):
     """With resolve_entities=False, &beta; is expected to stay an entity reference."""
     source = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
     root = etree.fromstring(source, parser=lxml_parser)
     tree = root.getroottree()
     serialized = serialize(tree, tree="lxml", omit_optional_tags=False,
                            resolve_entities=False)
     assert serialized == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'


 def test_serializer():
     """Run every case from the ``serializer-testdata`` ``*.test`` JSON files.

     Each file is loaded as JSON and every entry of its "tests" list is
     passed to ``runSerializerTest`` (a missing "options" key means ``{}``).
     """
     data_dir = os.path.dirname(__file__)
     for filename in get_data_files('serializer-testdata', '*.test', data_dir):
         with open(filename) as fp:
             tests = json.load(fp)
         # Cases are run inline rather than yielded: pytest 4.0 removed
         # yield-style tests, so a generator test function is no longer
         # executed there and its cases would silently never run.
         for test in tests['tests']:
             runSerializerTest(test["input"], test["expected"], test.get("options", {}))
	from __future__ import absolute_import, division, unicode_literals

	import os
	import json

	import pytest

	from .support import get_data_files

	from html5lib import constants
	from html5lib.filters.lint import Filter as Lint
	from html5lib.serializer import HTMLSerializer, serialize
	from html5lib.treewalkers.base import TreeWalker

	# pylint:disable=wrong-import-position
	optionals_loaded = []

	try:
	from lxml import etree
	optionals_loaded.append("lxml")
	except ImportError:
	pass
	# pylint:enable=wrong-import-position

	default_namespace = constants.namespaces["html"]


	class JsonWalker(TreeWalker):
	    """Tree walker over the JSON token lists used by the serializer tests.

	    Each item of ``self.tree`` is a list whose first element names the
	    token type; the remaining elements depend on that type.
	    """

	    def __iter__(self):
	        """Yield tree-walker tokens for every JSON token in ``self.tree``.

	        Raises ValueError for a token type that is not handled here.
	        """
	        for token in self.tree:
	            # Named token_type (not ``type``) to avoid shadowing the builtin.
	            token_type = token[0]
	            if token_type == "StartTag":
	                # Four items means an explicit namespace was supplied.
	                if len(token) == 4:
	                    namespace, name, attrib = token[1:4]
	                else:
	                    namespace = default_namespace
	                    name, attrib = token[1:3]
	                yield self.startTag(namespace, name, self._convertAttrib(attrib))
	            elif token_type == "EndTag":
	                if len(token) == 3:
	                    namespace, name = token[1:3]
	                else:
	                    namespace = default_namespace
	                    name = token[1]
	                yield self.endTag(namespace, name)
	            elif token_type == "EmptyTag":
	                if len(token) == 4:
	                    namespace, name, attrib = token[1:]
	                else:
	                    namespace = default_namespace
	                    name, attrib = token[1:]
	                # Separate loop variable so the outer ``token`` is not rebound.
	                for out_token in self.emptyTag(namespace, name, self._convertAttrib(attrib)):
	                    yield out_token
	            elif token_type == "Comment":
	                yield self.comment(token[1])
	            elif token_type in ("Characters", "SpaceCharacters"):
	                for out_token in self.text(token[1]):
	                    yield out_token
	            elif token_type == "Doctype":
	                if len(token) == 4:
	                    yield self.doctype(token[1], token[2], token[3])
	                elif len(token) == 3:
	                    yield self.doctype(token[1], token[2])
	                else:
	                    yield self.doctype(token[1])
	            else:
	                raise ValueError("Unknown token type: " + token_type)

	    def _convertAttrib(self, attribs):
	        """html5lib tree-walkers use a dict of (namespace, name): value for
	        attributes, but JSON cannot represent this. Convert from the format
	        in the serializer tests (a list of dicts with "namespace", "name",
	        and "value" as keys) to html5lib's tree-walker format."""
	        attrs = {}
	        for attrib in attribs:
	            name = (attrib["namespace"], attrib["name"])
	            # Test data must not repeat an attribute.
	            assert name not in attrs
	            attrs[name] = attrib["value"]
	        return attrs


	def serialize_html(input, options):
	    """Render the JSON token list ``input`` through ``HTMLSerializer``.

	    ``options`` is copied with ``str`` keys and used as keyword arguments
	    for the serializer, except "encoding", which is passed to ``render``.
	    """
	    options = dict([(str(k), v) for k, v in options.items()])
	    encoding = options.get("encoding", None)
	    if "encoding" in options:
	        del options["encoding"]
	    stream = Lint(JsonWalker(input), False)
	    serializer = HTMLSerializer(alphabetical_attributes=True, **options)
	    return serializer.render(stream, encoding)


	def runSerializerTest(input, expected, options):
	    """Assert that serializing ``input`` gives one of the ``expected`` outputs.

	    If ``options`` has a truthy "encoding", the expected strings are
	    encoded with it first.
	    """
	    encoding = options.get("encoding", None)

	    if encoding:
	        expected = list(map(lambda x: x.encode(encoding), expected))

	    result = serialize_html(input, options)
	    if len(expected) == 1:
	        assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions:\n%s" % (expected[0], result, str(options))
	    elif result not in expected:
	        assert False, "Expected: %s, Received: %s" % (expected, result)


	def throwsWithLatin1(input):
	    """Check that serializing ``input`` as ISO-8859-1 raises UnicodeEncodeError."""
	    with pytest.raises(UnicodeEncodeError):
	        serialize_html(input, {"encoding": "iso-8859-1"})


	def testDoctypeName():
	    """Doctype name U+0101 cannot be serialized as ISO-8859-1."""
	    throwsWithLatin1([["Doctype", "\u0101"]])


	def testDoctypePublicId():
	    """U+0101 in the doctype's second field cannot be serialized as ISO-8859-1."""
	    throwsWithLatin1([["Doctype", "potato", "\u0101"]])


	def testDoctypeSystemId():
	    """U+0101 in the doctype's third field cannot be serialized as ISO-8859-1."""
	    throwsWithLatin1([["Doctype", "potato", "potato", "\u0101"]])


	def testCdataCharacters():
	    """Text U+0101 following a <style> start tag is expected as &amacr;."""
	    runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\u0101"]],
	                      ["<style>&amacr;"], {"encoding": "iso-8859-1"})


	def testCharacters():
	    """Text U+0101 is expected as &amacr; when encoding to ISO-8859-1."""
	    runSerializerTest([["Characters", "\u0101"]],
	                      ["&amacr;"], {"encoding": "iso-8859-1"})


	def testStartTagName():
	    """Start-tag name U+0101 cannot be serialized as ISO-8859-1."""
	    throwsWithLatin1([["StartTag", "http://www.w3.org/1999/xhtml", "\u0101", []]])


	def testAttributeName():
	    """Attribute name U+0101 cannot be serialized as ISO-8859-1."""
	    throwsWithLatin1([["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": None, "name": "\u0101", "value": "potato"}]]])


	def testAttributeValue():
	    """Attribute value U+0101 is expected as &amacr; when encoding to ISO-8859-1."""
	    runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "span",
	                        [{"namespace": None, "name": "potato", "value": "\u0101"}]]],
	                      ["<span potato=&amacr;>"], {"encoding": "iso-8859-1"})


	def testEndTagName():
	    """End-tag name U+0101 cannot be serialized as ISO-8859-1."""
	    throwsWithLatin1([["EndTag", "http://www.w3.org/1999/xhtml", "\u0101"]])


	def testComment():
	    """Comment text U+0101 cannot be serialized as ISO-8859-1."""
	    throwsWithLatin1([["Comment", "\u0101"]])


	def testThrowsUnknownOption():
	    """``HTMLSerializer`` must reject an unknown keyword with TypeError."""
	    with pytest.raises(TypeError):
	        HTMLSerializer(foobar=None)


	@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"))
	def testSpecQuoteAttribute(c):
	    """In "spec" quoting mode the attribute value ``c`` is expected quoted."""
	    input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span",
	               [{"namespace": None, "name": "foo", "value": c}]]]
	    # A value that is itself '"' is expected in single quotes.
	    if c == '"':
	        output_ = ["<span foo='%s'>" % c]
	    else:
	        output_ = ['<span foo="%s">' % c]
	    options_ = {"quote_attr_values": "spec"}
	    runSerializerTest(input_, output_, options_)


	@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"
	                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
	                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
	                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
	                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
	                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
	                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
	                                   "\u3000"))
	def testLegacyQuoteAttribute(c):
	    """In "legacy" quoting mode the attribute value ``c`` is expected quoted."""
	    input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span",
	               [{"namespace": None, "name": "foo", "value": c}]]]
	    # A value that is itself '"' is expected in single quotes.
	    if c == '"':
	        output_ = ["<span foo='%s'>" % c]
	    else:
	        output_ = ['<span foo="%s">' % c]
	    options_ = {"quote_attr_values": "legacy"}
	    runSerializerTest(input_, output_, options_)


	@pytest.fixture
	def lxml_parser():
	    """Return an lxml ``XMLParser`` built with ``resolve_entities=False``."""
	    return etree.XMLParser(resolve_entities=False)


	@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
	def testEntityReplacement(lxml_parser):
	    """By default the &beta; entity is expected to be serialized as U+03B2."""
	    # The input must contain the entity reference itself, not a literal
	    # beta character, or the test exercises no entity handling at all.
	    doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
	    tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
	    result = serialize(tree, tree="lxml", omit_optional_tags=False)
	    assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>'


	@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
	def testEntityXML(lxml_parser):
	    """The &gt; entity is expected to be serialized back as &gt;."""
	    # Input and expected output both use the entity reference &gt;, not a
	    # bare ">" character.
	    doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'
	    tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
	    result = serialize(tree, tree="lxml", omit_optional_tags=False)
	    assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'


	@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
	def testEntityNoResolve(lxml_parser):
	    """With resolve_entities=False the &beta; reference is expected unchanged."""
	    # Input and expected output both use the entity reference &beta;, not
	    # a literal beta character.
	    doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
	    tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
	    result = serialize(tree, tree="lxml", omit_optional_tags=False,
	                       resolve_entities=False)
	    assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'


	def test_serializer():
	    """Run all cases found in the ``serializer-testdata`` ``*.test`` files.

	    Every entry of a file's "tests" list is handed to
	    ``runSerializerTest``; a missing "options" key defaults to ``{}``.
	    """
	    for filename in get_data_files('serializer-testdata', '*.test', os.path.dirname(__file__)):
	        with open(filename) as fp:
	            tests = json.load(fp)
	        # Call the helper directly instead of yielding (func, args) tuples:
	        # yield-style tests were removed in pytest 4.0 and would not run.
	        for test in tests['tests']:
	            runSerializerTest(test["input"], test["expected"], test.get("options", {}))