test/test_html.doctest - external/github.com/jjlee/mechanize - Git at Google

 >>> import mechanize
 >>> from mechanize._response import test_html_response
 >>> from mechanize._html import LinksFactory, FormsFactory, TitleFactory, \
 ... MechanizeBs, \
 ... RobustLinksFactory,  RobustFormsFactory, RobustTitleFactory

 mechanize.ParseError should be raised on parsing erroneous HTML.

 For backwards compatibility, mechanize.ParseError derives from
 exception classes that mechanize used to raise, prior to version
 0.1.6.

 >>> import sgmllib
 >>> import HTMLParser
 >>> issubclass(mechanize.ParseError, sgmllib.SGMLParseError)
 True
 >>> issubclass(mechanize.ParseError, HTMLParser.HTMLParseError)
 True

 >>> def create_response(error=True):
 ...     # Build the shared test page.  With error=True the <head> also
 ...     # carries the erroneous "<!!!>" markup used by the ParseError
 ...     # examples; with error=False nothing extra is inserted.
 ...     if error:
 ...         bad_markup = "<!!!>"
 ...     else:
 ...         bad_markup = ""
 ...     page = """\
 ... <html>
 ... <head>
 ...     <title>Title</title>
 ...     %s
 ... </head>
 ... <body>
 ...     <p>Hello world
 ... </body>
 ... </html>
 ... """ % bad_markup
 ...     return test_html_response(page)

 >>> f = LinksFactory()
 >>> f.set_response(create_response(), "http://example.com", "latin-1")
 >>> list(f.links())  # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 ParseError:
 >>> f = FormsFactory()
 >>> f.set_response(create_response(), "latin-1")
 >>> list(f.forms())  # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 ParseError:
 >>> f = TitleFactory()
 >>> f.set_response(create_response(), "latin-1")
 >>> f.title()  # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 ParseError:


 Accessing attributes on Factory may also raise ParseError

 >>> def factory_getattr(attr_name):
 ...    fact = mechanize.DefaultFactory()
 ...    fact.set_response(create_response())
 ...    getattr(fact, attr_name)
 >>> factory_getattr("title")  # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 ParseError:
 >>> factory_getattr("global_form")  # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 ParseError:


 BeautifulSoup ParseErrors:

 XXX If I could come up with examples that break links and forms
 parsing, I'd uncomment these!

 >>> def create_soup(html):
 ...     r = test_html_response(html)
 ...     return MechanizeBs("latin-1", r.read())

 #>>> f = RobustLinksFactory()
 #>>> html = """\
 #... <a href="a">
 #... <frame src="b">
 #... <a href="c">
 #... <iframe src="d">
 #... </a>
 #... </area>
 #... </frame>
 #... """
 #>>> f.set_soup(create_soup(html), "http://example.com", "latin-1")
 #>>> list(f.links())  # doctest: +IGNORE_EXCEPTION_DETAIL
 #Traceback (most recent call last):
 #ParseError:

 #>>> html = """\
 #... <table>
 #... <tr><td>
 #... <input name='broken'>
 #... </td>
 #... </form>
 #... </tr>
 #... </form>
 #... """
 #>>> f = RobustFormsFactory()
 #>>> f.set_response(test_html_response(html), "latin-1")
 #>>> list(f.forms())  # doctest: +IGNORE_EXCEPTION_DETAIL
 #Traceback (most recent call last):
 #ParseError:

 #>>> f = RobustTitleFactory()
 #>>> f.set_soup(create_soup(""), "latin-1")
 #>>> f.title()  # doctest: +IGNORE_EXCEPTION_DETAIL
 #Traceback (most recent call last):
 #ParseError:


 Utility class for caching forms etc.

 >>> from mechanize._html import CachingGeneratorFunction

 >>> i = [1]
 >>> func = CachingGeneratorFunction(i)
 >>> list(func())
 [1]
 >>> list(func())
 [1]

 >>> i = [1, 2, 3]
 >>> func = CachingGeneratorFunction(i)
 >>> list(func())
 [1, 2, 3]

 >>> i = func()
 >>> i.next()
 1
 >>> i.next()
 2
 >>> i.next()
 3

 >>> i = func()
 >>> j = func()
 >>> i.next()
 1
 >>> j.next()
 1
 >>> i.next()
 2
 >>> j.next()
 2
 >>> j.next()
 3
 >>> i.next()
 3
 >>> i.next()
 Traceback (most recent call last):
 ...
 StopIteration
 >>> j.next()
 Traceback (most recent call last):
 ...
 StopIteration


 Link text parsing

 >>> def get_first_link_text_bs(html):
 ...     factory = RobustLinksFactory()
 ...     soup = MechanizeBs("utf-8", html)
 ...     factory.set_soup(soup, "http://example.com/", "utf-8")
 ...     return list(factory.links())[0].text

 >>> def get_first_link_text_sgmllib(html):
 ...     factory = LinksFactory()
 ...     response = test_html_response(html)
 ...     factory.set_response(response, "http://example.com/", "utf-8")
 ...     return list(factory.links())[0].text

 Whitespace gets compressed down to single spaces.  Tags are removed.

 >>> html = ("""\
 ... <html><head><title>Title</title></head><body>
 ... <p><a href="http://example.com/">The  quick\tbrown fox jumps
 ...   over the <i><b>lazy</b></i> dog </a>
 ... </body></html>
 ... """)
 >>> get_first_link_text_bs(html)
 'The quick brown fox jumps over the lazy dog'
 >>> get_first_link_text_sgmllib(html)
 'The quick brown fox jumps over the lazy dog'

 Empty <a> links have empty link text

 >>> html = ("""\
 ... <html><head><title>Title</title></head><body>
 ... <p><a href="http://example.com/"></a>
 ... </body></html>
 ... """)
 >>> get_first_link_text_bs(html)
 ''
 >>> get_first_link_text_sgmllib(html)
 ''

 But for backwards-compatibility, empty non-<a> links have None link text

 >>> html = ("""\
 ... <html><head><title>Title</title></head><body>
 ... <p><frame src="http://example.com/"></frame>
 ... </body></html>
 ... """)
 >>> print get_first_link_text_bs(html)
 None
 >>> print get_first_link_text_sgmllib(html)
 None


 Title parsing.  We follow Firefox's behaviour with regard to child
 elements (haven't tested IE).

 >>> def get_title_bs(html):
 ...     factory = RobustTitleFactory()
 ...     soup = MechanizeBs("utf-8", html)
 ...     factory.set_soup(soup, "utf-8")
 ...     return factory.title()

 >>> def get_title_sgmllib(html):
 ...     factory = TitleFactory()
 ...     response = test_html_response(html)
 ...     factory.set_response(response, "utf-8")
 ...     return factory.title()

 >>> html = ("""\
 ... <html><head>
 ... <title>Title</title>
 ... </head><body><p>Blah.<p></body></html>
 ... """)
 >>> get_title_bs(html)
 'Title'
 >>> get_title_sgmllib(html)
 'Title'

 >>> html = ("""\
 ... <html><head>
 ... <title>  Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script>
 ... tle &amp;&#38;
 ... </title>
 ... </head><body><p>Blah.<p></body></html>
 ... """)
 >>> get_title_bs(html)
 'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&'
 >>> get_title_sgmllib(html)
 'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&'


 No more tags after <title> used to cause an exception

 >>> html = ("""\
 ... <html><head>
 ... <title>""")
 >>> get_title_sgmllib(html)
 ''
	>>> import mechanize
	>>> from mechanize._response import test_html_response
	>>> from mechanize._html import LinksFactory, FormsFactory, TitleFactory, \
	... MechanizeBs, \
	... RobustLinksFactory, RobustFormsFactory, RobustTitleFactory

	mechanize.ParseError should be raised on parsing erroneous HTML.

	For backwards compatibility, mechanize.ParseError derives from
	exception classes that mechanize used to raise, prior to version
	0.1.6.

	>>> import sgmllib
	>>> import HTMLParser
	>>> issubclass(mechanize.ParseError, sgmllib.SGMLParseError)
	True
	>>> issubclass(mechanize.ParseError, HTMLParser.HTMLParseError)
	True

	>>> def create_response(error=True):
	...     # Build the shared test page.  With error=True the <head> also
	...     # carries the erroneous "<!!!>" markup used by the ParseError
	...     # examples; with error=False nothing extra is inserted.
	...     extra = ""
	...     if error:
	...         extra = "<!!!>"
	...     html = """\
	... <html>
	... <head>
	...     <title>Title</title>
	...     %s
	... </head>
	... <body>
	...     <p>Hello world
	... </body>
	... </html>
	... """ % extra
	...     return test_html_response(html)

	>>> f = LinksFactory()
	>>> f.set_response(create_response(), "http://example.com", "latin-1")
	>>> list(f.links()) # doctest: +IGNORE_EXCEPTION_DETAIL
	Traceback (most recent call last):
	ParseError:
	>>> f = FormsFactory()
	>>> f.set_response(create_response(), "latin-1")
	>>> list(f.forms()) # doctest: +IGNORE_EXCEPTION_DETAIL
	Traceback (most recent call last):
	ParseError:
	>>> f = TitleFactory()
	>>> f.set_response(create_response(), "latin-1")
	>>> f.title() # doctest: +IGNORE_EXCEPTION_DETAIL
	Traceback (most recent call last):
	ParseError:


	Accessing attributes on Factory may also raise ParseError

	>>> def factory_getattr(attr_name):
	... fact = mechanize.DefaultFactory()
	... fact.set_response(create_response())
	... getattr(fact, attr_name)
	>>> factory_getattr("title") # doctest: +IGNORE_EXCEPTION_DETAIL
	Traceback (most recent call last):
	ParseError:
	>>> factory_getattr("global_form") # doctest: +IGNORE_EXCEPTION_DETAIL
	Traceback (most recent call last):
	ParseError:


	BeautifulSoup ParseErrors:

	XXX If I could come up with examples that break links and forms
	parsing, I'd uncomment these!

	>>> def create_soup(html):
	... r = test_html_response(html)
	... return MechanizeBs("latin-1", r.read())

	#>>> f = RobustLinksFactory()
	#>>> html = """\
	#... <a href="a">
	#... <frame src="b">
	#... <a href="c">
	#... <iframe src="d">
	#... </a>
	#... </area>
	#... </frame>
	#... """
	#>>> f.set_soup(create_soup(html), "http://example.com", "latin-1")
	#>>> list(f.links()) # doctest: +IGNORE_EXCEPTION_DETAIL
	#Traceback (most recent call last):
	#ParseError:

	#>>> html = """\
	#... <table>
	#... <tr><td>
	#... <input name='broken'>
	#... </td>
	#... </form>
	#... </tr>
	#... </form>
	#... """
	#>>> f = RobustFormsFactory()
	#>>> f.set_response(test_html_response(html), "latin-1")
	#>>> list(f.forms()) # doctest: +IGNORE_EXCEPTION_DETAIL
	#Traceback (most recent call last):
	#ParseError:

	#>>> f = RobustTitleFactory()
	#>>> f.set_soup(create_soup(""), "latin-1")
	#>>> f.title() # doctest: +IGNORE_EXCEPTION_DETAIL
	#Traceback (most recent call last):
	#ParseError:



	Utility class for caching forms etc.

	>>> from mechanize._html import CachingGeneratorFunction

	>>> i = [1]
	>>> func = CachingGeneratorFunction(i)
	>>> list(func())
	[1]
	>>> list(func())
	[1]

	>>> i = [1, 2, 3]
	>>> func = CachingGeneratorFunction(i)
	>>> list(func())
	[1, 2, 3]

	>>> i = func()
	>>> i.next()
	1
	>>> i.next()
	2
	>>> i.next()
	3

	>>> i = func()
	>>> j = func()
	>>> i.next()
	1
	>>> j.next()
	1
	>>> i.next()
	2
	>>> j.next()
	2
	>>> j.next()
	3
	>>> i.next()
	3
	>>> i.next()
	Traceback (most recent call last):
	...
	StopIteration
	>>> j.next()
	Traceback (most recent call last):
	...
	StopIteration


	Link text parsing

	>>> def get_first_link_text_bs(html):
	... factory = RobustLinksFactory()
	... soup = MechanizeBs("utf-8", html)
	... factory.set_soup(soup, "http://example.com/", "utf-8")
	... return list(factory.links())[0].text

	>>> def get_first_link_text_sgmllib(html):
	... factory = LinksFactory()
	... response = test_html_response(html)
	... factory.set_response(response, "http://example.com/", "utf-8")
	... return list(factory.links())[0].text

	Whitespace gets compressed down to single spaces. Tags are removed.

	>>> html = ("""\
	... <html><head><title>Title</title></head><body>
	... <p><a href="http://example.com/">The quick\tbrown fox jumps
	... over the <i><b>lazy</b></i> dog </a>
	... </body></html>
	... """)
	>>> get_first_link_text_bs(html)
	'The quick brown fox jumps over the lazy dog'
	>>> get_first_link_text_sgmllib(html)
	'The quick brown fox jumps over the lazy dog'

	Empty <a> links have empty link text

	>>> html = ("""\
	... <html><head><title>Title</title></head><body>
	... <p><a href="http://example.com/"></a>
	... </body></html>
	... """)
	>>> get_first_link_text_bs(html)
	''
	>>> get_first_link_text_sgmllib(html)
	''

	But for backwards-compatibility, empty non-<a> links have None link text

	>>> html = ("""\
	... <html><head><title>Title</title></head><body>
	... <p><frame src="http://example.com/"></frame>
	... </body></html>
	... """)
	>>> print get_first_link_text_bs(html)
	None
	>>> print get_first_link_text_sgmllib(html)
	None


	Title parsing. We follow Firefox's behaviour with regard to child
	elements (haven't tested IE).

	>>> def get_title_bs(html):
	... factory = RobustTitleFactory()
	... soup = MechanizeBs("utf-8", html)
	... factory.set_soup(soup, "utf-8")
	... return factory.title()

	>>> def get_title_sgmllib(html):
	... factory = TitleFactory()
	... response = test_html_response(html)
	... factory.set_response(response, "utf-8")
	... return factory.title()

	>>> html = ("""\
	... <html><head>
	... <title>Title</title>
	... </head><body><p>Blah.<p></body></html>
	... """)
	>>> get_title_bs(html)
	'Title'
	>>> get_title_sgmllib(html)
	'Title'

	>>> html = ("""\
	... <html><head>
	... <title> Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script>
	... tle &&
	... </title>
	... </head><body><p>Blah.<p></body></html>
	... """)
	>>> get_title_bs(html)
	'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&'
	>>> get_title_sgmllib(html)
	'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&'


	No more tags after <title> used to cause an exception

	>>> html = ("""\
	... <html><head>
	... <title>""")
	>>> get_title_sgmllib(html)
	''