| >>> import mechanize |
| >>> from mechanize._response import test_html_response |
| >>> from mechanize._html import LinksFactory, FormsFactory, TitleFactory, \ |
| ... MechanizeBs, \ |
| ... RobustLinksFactory, RobustFormsFactory, RobustTitleFactory |
| |
| mechanize.ParseError should be raised on parsing erroneous HTML. |
| |
| For backwards compatibility, mechanize.ParseError derives from |
| exception classes that mechanize used to raise, prior to version |
| 0.1.6. |
| |
| >>> import sgmllib |
| >>> import HTMLParser |
| >>> issubclass(mechanize.ParseError, sgmllib.SGMLParseError) |
| True |
| >>> issubclass(mechanize.ParseError, HTMLParser.HTMLParseError) |
| True |
| |
| >>> def create_response(error=True): |
| ...     """Return a test HTML response for a small fixed page. |
| ...     If error is true (the default), the markup "<!!!>" is inserted |
| ...     into <head>; otherwise nothing is inserted there. |
| ...     """ |
| ...     extra = "" |
| ...     if error: |
| ...         extra = "<!!!>" |
| ...     html = """\ |
| ... <html> |
| ... <head> |
| ... <title>Title</title> |
| ... %s |
| ... </head> |
| ... <body> |
| ... <p>Hello world |
| ... </body> |
| ... </html> |
| ... """ % extra |
| ...     return test_html_response(html) |
| |
| >>> f = LinksFactory() |
| >>> f.set_response(create_response(), "http://example.com", "latin-1") |
| >>> list(f.links()) # doctest: +IGNORE_EXCEPTION_DETAIL |
| Traceback (most recent call last): |
| ParseError: |
| >>> f = FormsFactory() |
| >>> f.set_response(create_response(), "latin-1") |
| >>> list(f.forms()) # doctest: +IGNORE_EXCEPTION_DETAIL |
| Traceback (most recent call last): |
| ParseError: |
| >>> f = TitleFactory() |
| >>> f.set_response(create_response(), "latin-1") |
| >>> f.title() # doctest: +IGNORE_EXCEPTION_DETAIL |
| Traceback (most recent call last): |
| ParseError: |
| |
| |
| Accessing attributes on a Factory (here mechanize.DefaultFactory) may also raise ParseError. |
| |
| >>> def factory_getattr(attr_name): |
| ... fact = mechanize.DefaultFactory() |
| ... fact.set_response(create_response()) |
| ... getattr(fact, attr_name) |
| >>> factory_getattr("title") # doctest: +IGNORE_EXCEPTION_DETAIL |
| Traceback (most recent call last): |
| ParseError: |
| >>> factory_getattr("global_form") # doctest: +IGNORE_EXCEPTION_DETAIL |
| Traceback (most recent call last): |
| ParseError: |
| |
| |
| BeautifulSoup ParseErrors: |
| |
| XXX The examples below are commented out because I have not yet come up |
| with HTML that breaks links and forms parsing. If I could, I'd uncomment these! |
| |
| >>> def create_soup(html): |
| ... r = test_html_response(html) |
| ... return MechanizeBs("latin-1", r.read()) |
| |
| #>>> f = RobustLinksFactory() |
| #>>> html = """\ |
| #... <a href="a"> |
| #... <frame src="b"> |
| #... <a href="c"> |
| #... <iframe src="d"> |
| #... </a> |
| #... </area> |
| #... </frame> |
| #... """ |
| #>>> f.set_soup(create_soup(html), "http://example.com", "latin-1") |
| #>>> list(f.links()) # doctest: +IGNORE_EXCEPTION_DETAIL |
| #Traceback (most recent call last): |
| #ParseError: |
| |
| #>>> html = """\ |
| #... <table> |
| #... <tr><td> |
| #... <input name='broken'> |
| #... </td> |
| #... </form> |
| #... </tr> |
| #... </form> |
| #... """ |
| #>>> f = RobustFormsFactory() |
| #>>> f.set_response(create_response(), "latin-1") |
| #>>> list(f.forms()) # doctest: +IGNORE_EXCEPTION_DETAIL |
| #Traceback (most recent call last): |
| #ParseError: |
| |
| #>>> f = RobustTitleFactory() |
| #>>> f.set_soup(create_soup(""), "latin-1") |
| #>>> f.title() # doctest: +IGNORE_EXCEPTION_DETAIL |
| #Traceback (most recent call last): |
| #ParseError: |
| |
| |
| |
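| CachingGeneratorFunction: utility class for caching forms etc. As the |
| examples below show, each call returns a fresh iterator over the same |
| items, and iterators from separate calls advance independently. |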
| |
| >>> from mechanize._html import CachingGeneratorFunction |
| |
| >>> i = [1] |
| >>> func = CachingGeneratorFunction(i) |
| >>> list(func()) |
| [1] |
| >>> list(func()) |
| [1] |
| |
| >>> i = [1, 2, 3] |
| >>> func = CachingGeneratorFunction(i) |
| >>> list(func()) |
| [1, 2, 3] |
| |
| >>> i = func() |
| >>> i.next() |
| 1 |
| >>> i.next() |
| 2 |
| >>> i.next() |
| 3 |
| |
| >>> i = func() |
| >>> j = func() |
| >>> i.next() |
| 1 |
| >>> j.next() |
| 1 |
| >>> i.next() |
| 2 |
| >>> j.next() |
| 2 |
| >>> j.next() |
| 3 |
| >>> i.next() |
| 3 |
| >>> i.next() |
| Traceback (most recent call last): |
| ... |
| StopIteration |
| >>> j.next() |
| Traceback (most recent call last): |
| ... |
| StopIteration |
| |
| |
| Link text parsing |
| |
| >>> def get_first_link_text_bs(html): |
| ... factory = RobustLinksFactory() |
| ... soup = MechanizeBs("utf-8", html) |
| ... factory.set_soup(soup, "http://example.com/", "utf-8") |
| ... return list(factory.links())[0].text |
| |
| >>> def get_first_link_text_sgmllib(html): |
| ... factory = LinksFactory() |
| ... response = test_html_response(html) |
| ... factory.set_response(response, "http://example.com/", "utf-8") |
| ... return list(factory.links())[0].text |
| |
| Whitespace gets compressed down to single spaces. Tags are removed. |
| |
| >>> html = ("""\ |
| ... <html><head><title>Title</title></head><body> |
| ... <p><a href="http://example.com/">The quick\tbrown fox jumps |
| ... over the <i><b>lazy</b></i> dog </a> |
| ... </body></html> |
| ... """) |
| >>> get_first_link_text_bs(html) |
| 'The quick brown fox jumps over the lazy dog' |
| >>> get_first_link_text_sgmllib(html) |
| 'The quick brown fox jumps over the lazy dog' |
| |
| Empty <a> links have empty link text |
| |
| >>> html = ("""\ |
| ... <html><head><title>Title</title></head><body> |
| ... <p><a href="http://example.com/"></a> |
| ... </body></html> |
| ... """) |
| >>> get_first_link_text_bs(html) |
| '' |
| >>> get_first_link_text_sgmllib(html) |
| '' |
| |
| But for backwards-compatibility, empty non-<a> links have None link text |
| |
| >>> html = ("""\ |
| ... <html><head><title>Title</title></head><body> |
| ... <p><frame src="http://example.com/"></frame> |
| ... </body></html> |
| ... """) |
| >>> print get_first_link_text_bs(html) |
| None |
| >>> print get_first_link_text_sgmllib(html) |
| None |
| |
| |
| Title parsing. We follow Firefox's behaviour with regard to child |
| elements (haven't tested IE). |
| |
| >>> def get_title_bs(html): |
| ... factory = RobustTitleFactory() |
| ... soup = MechanizeBs("utf-8", html) |
| ... factory.set_soup(soup, "utf-8") |
| ... return factory.title() |
| |
| >>> def get_title_sgmllib(html): |
| ... factory = TitleFactory() |
| ... response = test_html_response(html) |
| ... factory.set_response(response, "utf-8") |
| ... return factory.title() |
| |
| >>> html = ("""\ |
| ... <html><head> |
| ... <title>Title</title> |
| ... </head><body><p>Blah.<p></body></html> |
| ... """) |
| >>> get_title_bs(html) |
| 'Title' |
| >>> get_title_sgmllib(html) |
| 'Title' |
| |
| >>> html = ("""\ |
| ... <html><head> |
| ... <title> Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> |
| ... tle && |
| ... </title> |
| ... </head><body><p>Blah.<p></body></html> |
| ... """) |
| >>> get_title_bs(html) |
| 'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&' |
| >>> get_title_sgmllib(html) |
| 'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&' |
| |
| |
| No more tags after <title> used to cause an exception |
| |
| >>> html = ("""\ |
| ... <html><head> |
| ... <title>""") |
| >>> get_title_sgmllib(html) |
| '' |