blob: cd76943582157f24916a8ac404ec603d68bad6c9 [file] [log] [blame]
>>> import mechanize
>>> from mechanize._response import test_html_response
>>> from mechanize._html import LinksFactory, FormsFactory, TitleFactory, \
... MechanizeBs, \
... RobustLinksFactory, RobustFormsFactory, RobustTitleFactory

mechanize.ParseError should be raised when parsing malformed HTML.
For backwards compatibility, mechanize.ParseError derives from the
exception classes that mechanize used to raise prior to version
0.1.6.

>>> import sgmllib
>>> import HTMLParser
>>> issubclass(mechanize.ParseError, sgmllib.SGMLParseError)
True
>>> issubclass(mechanize.ParseError, HTMLParser.HTMLParseError)
True
>>> def create_response(error=True):
...     # Build a small HTML document and wrap it with test_html_response().
...     # With error=True (the default) "<!!!>" is spliced into <head>; the
...     # examples that follow expect parsing such a response to raise
...     # ParseError.  With error=False nothing extra is inserted.
...     extra = ""
...     if error:
...         extra = "<!!!>"
...     html = """\
... <html>
... <head>
... <title>Title</title>
... %s
... </head>
... <body>
... <p>Hello world
... </body>
... </html>
... """ % extra
...     return test_html_response(html)
>>> f = LinksFactory()
>>> f.set_response(create_response(), "http://example.com", "latin-1")
>>> list(f.links()) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
ParseError:
>>> f = FormsFactory()
>>> f.set_response(create_response(), "latin-1")
>>> list(f.forms()) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
ParseError:
>>> f = TitleFactory()
>>> f.set_response(create_response(), "latin-1")
>>> f.title() # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
ParseError:

Accessing attributes on a Factory may also raise ParseError.

>>> def factory_getattr(attr_name):
...     # Feed a deliberately broken response to a fresh DefaultFactory and
...     # then look up attr_name on it.  Only the attribute access itself is
...     # under test, so the looked-up value is discarded (returns None).
...     fact = mechanize.DefaultFactory()
...     fact.set_response(create_response())
...     getattr(fact, attr_name)
>>> factory_getattr("title") # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
ParseError:
>>> factory_getattr("global_form") # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
ParseError:

BeautifulSoup ParseErrors:

XXX If I could come up with examples that break link and form
parsing, I'd uncomment these!

>>> def create_soup(html):
...     # Wrap html in a test response, read the body back and hand it to
...     # MechanizeBs using the "latin-1" encoding.
...     r = test_html_response(html)
...     return MechanizeBs("latin-1", r.read())
#>>> f = RobustLinksFactory()
#>>> html = """\
#... <a href="a">
#... <frame src="b">
#... <a href="c">
#... <iframe src="d">
#... </a>
#... </area>
#... </frame>
#... """
#>>> f.set_soup(create_soup(html), "http://example.com", "latin-1")
#>>> list(f.links()) # doctest: +IGNORE_EXCEPTION_DETAIL
#Traceback (most recent call last):
#ParseError:
#>>> html = """\
#... <table>
#... <tr><td>
#... <input name='broken'>
#... </td>
#... </form>
#... </tr>
#... </form>
#... """
#>>> f = RobustFormsFactory()
#>>> f.set_response(create_response(), "latin-1")
#>>> list(f.forms()) # doctest: +IGNORE_EXCEPTION_DETAIL
#Traceback (most recent call last):
#ParseError:
#>>> f = RobustTitleFactory()
#>>> f.set_soup(create_soup(""), "latin-1")
#>>> f.title() # doctest: +IGNORE_EXCEPTION_DETAIL
#Traceback (most recent call last):
#ParseError:

CachingGeneratorFunction is a utility class for caching forms etc.

>>> from mechanize._html import CachingGeneratorFunction
>>> i = [1]
>>> func = CachingGeneratorFunction(i)
>>> list(func())
[1]
>>> list(func())
[1]
>>> i = [1, 2, 3]
>>> func = CachingGeneratorFunction(i)
>>> list(func())
[1, 2, 3]
>>> i = func()
>>> i.next()
1
>>> i.next()
2
>>> i.next()
3
>>> i = func()
>>> j = func()
>>> i.next()
1
>>> j.next()
1
>>> i.next()
2
>>> j.next()
2
>>> j.next()
3
>>> i.next()
3
>>> i.next()
Traceback (most recent call last):
...
StopIteration
>>> j.next()
Traceback (most recent call last):
...
StopIteration

Link text parsing.

>>> def get_first_link_text_bs(html):
...     # Return the .text of the first link that RobustLinksFactory finds in
...     # html, parsed via MechanizeBs as UTF-8 with base URL
...     # http://example.com/.
...     factory = RobustLinksFactory()
...     soup = MechanizeBs("utf-8", html)
...     factory.set_soup(soup, "http://example.com/", "utf-8")
...     return list(factory.links())[0].text
>>> def get_first_link_text_sgmllib(html):
...     # Return the .text of the first link that LinksFactory finds in html,
...     # supplied as a test response decoded as UTF-8 with base URL
...     # http://example.com/.
...     factory = LinksFactory()
...     response = test_html_response(html)
...     factory.set_response(response, "http://example.com/", "utf-8")
...     return list(factory.links())[0].text

Runs of whitespace are collapsed to a single space, and tags are removed.

>>> html = ("""\
... <html><head><title>Title</title></head><body>
... <p><a href="http://example.com/">The quick\tbrown fox jumps
... over the <i><b>lazy</b></i> dog </a>
... </body></html>
... """)
>>> get_first_link_text_bs(html)
'The quick brown fox jumps over the lazy dog'
>>> get_first_link_text_sgmllib(html)
'The quick brown fox jumps over the lazy dog'

Empty <a> links have empty link text.

>>> html = ("""\
... <html><head><title>Title</title></head><body>
... <p><a href="http://example.com/"></a>
... </body></html>
... """)
>>> get_first_link_text_bs(html)
''
>>> get_first_link_text_sgmllib(html)
''

For backwards compatibility, however, empty non-<a> links have None as
their link text.

>>> html = ("""\
... <html><head><title>Title</title></head><body>
... <p><frame src="http://example.com/"></frame>
... </body></html>
... """)
>>> print get_first_link_text_bs(html)
None
>>> print get_first_link_text_sgmllib(html)
None

Title parsing.  We follow Firefox's behaviour with regard to child
elements (IE's behaviour has not been tested).

>>> def get_title_bs(html):
...     # Return the title that RobustTitleFactory extracts from html, parsed
...     # via MechanizeBs as UTF-8.
...     factory = RobustTitleFactory()
...     soup = MechanizeBs("utf-8", html)
...     factory.set_soup(soup, "utf-8")
...     return factory.title()
>>> def get_title_sgmllib(html):
...     # Return the title that TitleFactory extracts from html, supplied as
...     # a test response decoded as UTF-8.
...     factory = TitleFactory()
...     response = test_html_response(html)
...     factory.set_response(response, "utf-8")
...     return factory.title()
>>> html = ("""\
... <html><head>
... <title>Title</title>
... </head><body><p>Blah.<p></body></html>
... """)
>>> get_title_bs(html)
'Title'
>>> get_title_sgmllib(html)
'Title'
>>> html = ("""\
... <html><head>
... <title> Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script>
... tle &amp;&#38;
... </title>
... </head><body><p>Blah.<p></body></html>
... """)
>>> get_title_bs(html)
'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&'
>>> get_title_sgmllib(html)
'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&'

A document that ends right after an opening <title> tag (no more tags
follow it) used to cause an exception.

>>> html = ("""\
... <html><head>
... <title>""")
>>> get_title_sgmllib(html)
''