Migrate third_party/beautifulsoup4 to Python 3
Bug: chromium:1199459
Change-Id: Id07443fe2d1b00864bb9068345d6a50f5bef2c4e
Reviewed-on: https://chromium-review.googlesource.com/c/catapult/+/2886554
Auto-Submit: John Chen <johnchen@chromium.org>
Commit-Queue: Wenbin Zhang <wenbinzhang@google.com>
Reviewed-by: Wenbin Zhang <wenbinzhang@google.com>
diff --git a/third_party/beautifulsoup4/bs4/__init__.py b/third_party/beautifulsoup4/bs4/__init__.py
index 7ba3426..85d9135 100644
--- a/third_party/beautifulsoup4/bs4/__init__.py
+++ b/third_party/beautifulsoup4/bs4/__init__.py
@@ -16,6 +16,10 @@
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
+from __future__ import absolute_import
+from __future__ import print_function
+import six
+from six.moves import range
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.3.2"
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
@@ -135,12 +139,12 @@
"fromEncoding", "from_encoding")
if len(kwargs) > 0:
- arg = kwargs.keys().pop()
+ arg = list(kwargs.keys()).pop()
raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
- if isinstance(features, basestring):
+ if isinstance(features, six.string_types):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
@@ -164,7 +168,7 @@
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants.
- if (isinstance(markup, unicode)
+ if (isinstance(markup, six.text_type)
and not os.path.supports_unicode_filenames):
possible_filename = markup.encode("utf8")
else:
@@ -172,7 +176,7 @@
is_file = False
try:
is_file = os.path.exists(possible_filename)
- except Exception, e:
+ except Exception as e:
# This is almost certainly a problem involving
# characters not valid in filenames on this
# system. Just let it go.
@@ -184,7 +188,7 @@
# TODO: This is ugly but I couldn't get it to work in
# Python 3 otherwise.
if ((isinstance(markup, bytes) and not b' ' in markup)
- or (isinstance(markup, unicode) and not u' ' in markup)):
+ or (isinstance(markup, six.text_type) and not u' ' in markup)):
warnings.warn(
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
@@ -403,4 +407,4 @@
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
- print soup.prettify()
+ print(soup.prettify())
diff --git a/third_party/beautifulsoup4/bs4/builder/__init__.py b/third_party/beautifulsoup4/bs4/builder/__init__.py
index 740f5f2..819012d 100644
--- a/third_party/beautifulsoup4/bs4/builder/__init__.py
+++ b/third_party/beautifulsoup4/bs4/builder/__init__.py
@@ -1,3 +1,4 @@
+from __future__ import absolute_import
from collections import defaultdict
import itertools
import sys
@@ -6,6 +7,7 @@
ContentMetaAttributeValue,
whitespace_re
)
+import six
__all__ = [
'HTMLTreeBuilder',
@@ -159,7 +161,7 @@
# value is a whitespace-separated list of
# values. Split it into a list.
value = attrs[attr]
- if isinstance(value, basestring):
+ if isinstance(value, six.string_types):
values = whitespace_re.split(value)
else:
# html5lib sometimes calls setAttributes twice
diff --git a/third_party/beautifulsoup4/bs4/builder/_html5lib.py b/third_party/beautifulsoup4/bs4/builder/_html5lib.py
index 7de36ae..93d5381 100644
--- a/third_party/beautifulsoup4/bs4/builder/_html5lib.py
+++ b/third_party/beautifulsoup4/bs4/builder/_html5lib.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import
+import six
__all__ = [
'HTML5TreeBuilder',
]
@@ -37,7 +39,7 @@
doc = parser.parse(markup, encoding=self.user_specified_encoding)
# Set the character encoding detected by the tokenizer.
- if isinstance(markup, unicode):
+ if isinstance(markup, six.text_type):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
@@ -124,7 +126,7 @@
def appendChild(self, node):
string_child = child = None
- if isinstance(node, basestring):
+ if isinstance(node, six.string_types):
# Some other piece of code decided to pass in a string
# instead of creating a TextElement object to contain the
# string.
@@ -139,7 +141,7 @@
else:
child = node.element
- if not isinstance(child, basestring) and child.parent is not None:
+ if not isinstance(child, six.string_types) and child.parent is not None:
node.element.extract()
if (string_child and self.element.contents
@@ -152,7 +154,7 @@
old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else:
- if isinstance(node, basestring):
+ if isinstance(node, six.string_types):
# Create a brand new NavigableString from this string.
child = self.soup.new_string(node)
diff --git a/third_party/beautifulsoup4/bs4/builder/_htmlparser.py b/third_party/beautifulsoup4/bs4/builder/_htmlparser.py
index ca8d8b8..504a901 100644
--- a/third_party/beautifulsoup4/bs4/builder/_htmlparser.py
+++ b/third_party/beautifulsoup4/bs4/builder/_htmlparser.py
@@ -1,13 +1,18 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
+from __future__ import absolute_import
+from six import unichr
+import six
__all__ = [
'HTMLParserTreeBuilder',
]
-from HTMLParser import (
- HTMLParser,
- HTMLParseError,
- )
+from six.moves.html_parser import HTMLParser
+if six.PY2:
+ from six.moves.html_parser import HTMLParseError
+else:
+ class HTMLParseError(Exception):
+ pass
import sys
import warnings
@@ -73,7 +78,7 @@
try:
data = unichr(real_name)
- except (ValueError, OverflowError), e:
+ except (ValueError, OverflowError) as e:
data = u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
@@ -142,7 +147,7 @@
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
- if isinstance(markup, unicode):
+ if isinstance(markup, six.text_type):
yield (markup, None, None, False)
return
@@ -158,7 +163,7 @@
parser.soup = self.soup
try:
parser.feed(markup)
- except HTMLParseError, e:
+ except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
diff --git a/third_party/beautifulsoup4/bs4/builder/_lxml.py b/third_party/beautifulsoup4/bs4/builder/_lxml.py
index fa5d498..7a5e612 100644
--- a/third_party/beautifulsoup4/bs4/builder/_lxml.py
+++ b/third_party/beautifulsoup4/bs4/builder/_lxml.py
@@ -1,3 +1,5 @@
+from __future__ import absolute_import
+import six
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
@@ -78,12 +80,12 @@
Each 4-tuple represents a strategy for parsing the document.
"""
- if isinstance(markup, unicode):
+ if isinstance(markup, six.text_type):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
yield markup, None, document_declared_encoding, False
- if isinstance(markup, unicode):
+ if isinstance(markup, six.text_type):
# No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8",
@@ -102,7 +104,7 @@
def feed(self, markup):
if isinstance(markup, bytes):
markup = BytesIO(markup)
- elif isinstance(markup, unicode):
+ elif isinstance(markup, six.text_type):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
@@ -117,7 +119,7 @@
if len(data) != 0:
self.parser.feed(data)
self.parser.close()
- except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e))
def close(self):
@@ -224,7 +226,7 @@
self.parser = self.parser_for(encoding)
self.parser.feed(markup)
self.parser.close()
- except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e))
diff --git a/third_party/beautifulsoup4/bs4/dammit.py b/third_party/beautifulsoup4/bs4/dammit.py
index 59640b7..5fad727 100644
--- a/third_party/beautifulsoup4/bs4/dammit.py
+++ b/third_party/beautifulsoup4/bs4/dammit.py
@@ -7,11 +7,14 @@
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
+from __future__ import absolute_import
import codecs
-from htmlentitydefs import codepoint2name
+from six.moves.html_entities import codepoint2name
import re
import logging
import string
+from six import unichr
+import six
# Import a library to autodetect character encodings.
chardet_type = None
@@ -340,9 +343,9 @@
self.detector = EncodingDetector(markup, override_encodings, is_html)
# Short-circuit if the data is in Unicode to begin with.
- if isinstance(markup, unicode) or markup == '':
+ if isinstance(markup, six.text_type) or markup == '':
self.markup = markup
- self.unicode_markup = unicode(markup)
+ self.unicode_markup = six.text_type(markup)
self.original_encoding = None
return
@@ -425,7 +428,7 @@
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
- return unicode(data, encoding, errors)
+ return six.text_type(data, encoding, errors)
@property
def declared_html_encoding(self):
diff --git a/third_party/beautifulsoup4/bs4/diagnose.py b/third_party/beautifulsoup4/bs4/diagnose.py
index 4d0b00a..ec417ab 100644
--- a/third_party/beautifulsoup4/bs4/diagnose.py
+++ b/third_party/beautifulsoup4/bs4/diagnose.py
@@ -1,7 +1,9 @@
"""Diagnostic functions, mainly for use when doing tech support."""
+from __future__ import absolute_import
+from __future__ import print_function
import cProfile
from StringIO import StringIO
-from HTMLParser import HTMLParser
+from six.moves.html_parser import HTMLParser
import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
@@ -14,11 +16,13 @@
import traceback
import sys
import cProfile
+from six.moves import map
+from six.moves import range
def diagnose(data):
"""Diagnostic suite for isolating common problems."""
- print "Diagnostic running on Beautiful Soup %s" % __version__
- print "Python version %s" % sys.version
+ print("Diagnostic running on Beautiful Soup %s" % __version__)
+ print("Python version %s" % sys.version)
basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers:
@@ -27,44 +31,44 @@
break
else:
basic_parsers.remove(name)
- print (
+ print((
"I noticed that %s is not installed. Installing it may help." %
- name)
+ name))
if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"])
from lxml import etree
- print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+ print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
if 'html5lib' in basic_parsers:
import html5lib
- print "Found html5lib version %s" % html5lib.__version__
+ print("Found html5lib version %s" % html5lib.__version__)
if hasattr(data, 'read'):
data = data.read()
elif os.path.exists(data):
- print '"%s" looks like a filename. Reading data from the file.' % data
+ print('"%s" looks like a filename. Reading data from the file.' % data)
data = open(data).read()
elif data.startswith("http:") or data.startswith("https:"):
- print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
- print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
+ print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
+ print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
- print
+ print()
for parser in basic_parsers:
- print "Trying to parse your markup with %s" % parser
+ print("Trying to parse your markup with %s" % parser)
success = False
try:
soup = BeautifulSoup(data, parser)
success = True
- except Exception, e:
- print "%s could not parse the markup." % parser
+ except Exception as e:
+ print("%s could not parse the markup." % parser)
traceback.print_exc()
if success:
- print "Here's what %s did with the markup:" % parser
- print soup.prettify()
+ print("Here's what %s did with the markup:" % parser)
+ print(soup.prettify())
- print "-" * 80
+ print("-" * 80)
def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing.
@@ -74,7 +78,7 @@
"""
from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
- print("%s, %4s, %s" % (event, element.tag, element.text))
+ print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else."""
@@ -156,9 +160,9 @@
def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark."""
- print "Comparative parser benchmark on Beautiful Soup %s" % __version__
+ print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
data = rdoc(num_elements)
- print "Generated a large invalid HTML document (%d bytes)." % len(data)
+ print("Generated a large invalid HTML document (%d bytes)." % len(data))
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False
@@ -167,24 +171,24 @@
soup = BeautifulSoup(data, parser)
b = time.time()
success = True
- except Exception, e:
- print "%s could not parse the markup." % parser
+ except Exception as e:
+ print("%s could not parse the markup." % parser)
traceback.print_exc()
if success:
- print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
+ print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
- print "Raw lxml parsed the markup in %.2fs." % (b-a)
+ print("Raw lxml parsed the markup in %.2fs." % (b-a))
import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
- print "Raw html5lib parsed the markup in %.2fs." % (b-a)
+ print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def profile(num_elements=100000, parser="lxml"):
diff --git a/third_party/beautifulsoup4/bs4/element.py b/third_party/beautifulsoup4/bs4/element.py
index da9afdf..bda27c3 100644
--- a/third_party/beautifulsoup4/bs4/element.py
+++ b/third_party/beautifulsoup4/bs4/element.py
@@ -1,8 +1,11 @@
+from __future__ import absolute_import
+from __future__ import print_function
import collections
import re
import sys
import warnings
from bs4.dammit import EntitySubstitution
+import six
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
@@ -21,22 +24,22 @@
return alias
-class NamespacedAttribute(unicode):
+class NamespacedAttribute(six.text_type):
def __new__(cls, prefix, name, namespace=None):
if name is None:
- obj = unicode.__new__(cls, prefix)
+ obj = six.text_type.__new__(cls, prefix)
elif prefix is None:
# Not really namespaced.
- obj = unicode.__new__(cls, name)
+ obj = six.text_type.__new__(cls, name)
else:
- obj = unicode.__new__(cls, prefix + ":" + name)
+ obj = six.text_type.__new__(cls, prefix + ":" + name)
obj.prefix = prefix
obj.name = name
obj.namespace = namespace
return obj
-class AttributeValueWithCharsetSubstitution(unicode):
+class AttributeValueWithCharsetSubstitution(six.text_type):
"""A stand-in object for a character encoding specified in HTML."""
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
@@ -47,7 +50,7 @@
"""
def __new__(cls, original_value):
- obj = unicode.__new__(cls, original_value)
+ obj = six.text_type.__new__(cls, original_value)
obj.original_value = original_value
return obj
@@ -70,9 +73,9 @@
match = cls.CHARSET_RE.search(original_value)
if match is None:
# No substitution necessary.
- return unicode.__new__(unicode, original_value)
+ return six.text_type.__new__(six.text_type, original_value)
- obj = unicode.__new__(cls, original_value)
+ obj = six.text_type.__new__(cls, original_value)
obj.original_value = original_value
return obj
@@ -272,7 +275,7 @@
def insert(self, position, new_child):
if new_child is self:
raise ValueError("Cannot insert a tag into itself.")
- if (isinstance(new_child, basestring)
+ if (isinstance(new_child, six.string_types)
and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child)
@@ -489,7 +492,7 @@
result = (element for element in generator
if isinstance(element, Tag))
return ResultSet(strainer, result)
- elif isinstance(name, basestring):
+ elif isinstance(name, six.string_types):
# Optimization to find all tags with a given name.
result = (element for element in generator
if isinstance(element, Tag)
@@ -640,7 +643,7 @@
return self.parents
-class NavigableString(unicode, PageElement):
+class NavigableString(six.text_type, PageElement):
PREFIX = ''
SUFFIX = ''
@@ -653,15 +656,15 @@
passed in to the superclass's __new__ or the superclass won't know
how to handle non-ASCII characters.
"""
- if isinstance(value, unicode):
- return unicode.__new__(cls, value)
- return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+ if isinstance(value, six.text_type):
+ return six.text_type.__new__(cls, value)
+ return six.text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __copy__(self):
return self
def __getnewargs__(self):
- return (unicode(self),)
+ return (six.text_type(self),)
def __getattr__(self, attr):
"""text.string gives you text. This is for backwards
@@ -1025,8 +1028,8 @@
else:
if isinstance(val, list) or isinstance(val, tuple):
val = ' '.join(val)
- elif not isinstance(val, basestring):
- val = unicode(val)
+ elif not isinstance(val, six.string_types):
+ val = six.text_type(val)
elif (
isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None):
@@ -1034,7 +1037,7 @@
text = self.format_string(val, formatter)
decoded = (
- unicode(key) + '='
+ six.text_type(key) + '='
+ EntitySubstitution.quoted_attribute_value(text))
attrs.append(decoded)
close = ''
@@ -1210,16 +1213,16 @@
raise ValueError(
'Final combinator "%s" is missing an argument.' % tokens[-1])
if self._select_debug:
- print 'Running CSS selector "%s"' % selector
+ print('Running CSS selector "%s"' % selector)
for index, token in enumerate(tokens):
if self._select_debug:
- print ' Considering token "%s"' % token
+ print(' Considering token "%s"' % token)
recursive_candidate_generator = None
tag_name = None
if tokens[index-1] in self._selector_combinators:
# This token was consumed by the previous combinator. Skip it.
if self._select_debug:
- print ' Token was consumed by the previous combinator.'
+ print(' Token was consumed by the previous combinator.')
continue
# Each operation corresponds to a checker function, a rule
# for determining whether a candidate matches the
@@ -1325,14 +1328,14 @@
next_token = tokens[index+1]
def recursive_select(tag):
if self._select_debug:
- print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
- print '-' * 40
+ print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
+ print('-' * 40)
for i in tag.select(next_token, recursive_candidate_generator):
if self._select_debug:
- print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
+ print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
yield i
if self._select_debug:
- print '-' * 40
+ print('-' * 40)
_use_candidate_generator = recursive_select
elif _candidate_generator is None:
# By default, a tag's candidates are all of its
@@ -1343,7 +1346,7 @@
check = "[any]"
else:
check = tag_name
- print ' Default candidate generator, tag name="%s"' % check
+ print(' Default candidate generator, tag name="%s"' % check)
if self._select_debug:
# This is redundant with later code, but it stops
# a bunch of bogus tags from cluttering up the
@@ -1365,8 +1368,8 @@
new_context_ids = set([])
for tag in current_context:
if self._select_debug:
- print " Running candidate generator on %s %s" % (
- tag.name, repr(tag.attrs))
+ print(" Running candidate generator on %s %s" % (
+ tag.name, repr(tag.attrs)))
for candidate in _use_candidate_generator(tag):
if not isinstance(candidate, Tag):
continue
@@ -1381,21 +1384,21 @@
break
if checker is None or result:
if self._select_debug:
- print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
+ print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
if id(candidate) not in new_context_ids:
# If a tag matches a selector more than once,
# don't include it in the context more than once.
new_context.append(candidate)
new_context_ids.add(id(candidate))
elif self._select_debug:
- print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
+ print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
current_context = new_context
if self._select_debug:
- print "Final verdict:"
+ print("Final verdict:")
for i in current_context:
- print " %s %s" % (i.name, i.attrs)
+ print(" %s %s" % (i.name, i.attrs))
return current_context
# Old names for backwards compatibility
@@ -1448,7 +1451,7 @@
def _normalize_search_value(self, value):
# Leave it alone if it's a Unicode string, a callable, a
# regular expression, a boolean, or None.
- if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
+ if (isinstance(value, six.text_type) or callable(value) or hasattr(value, 'match')
or isinstance(value, bool) or value is None):
return value
@@ -1461,7 +1464,7 @@
new_value = []
for v in value:
if (hasattr(v, '__iter__') and not isinstance(v, bytes)
- and not isinstance(v, unicode)):
+ and not isinstance(v, six.text_type)):
# This is almost certainly the user's mistake. In the
# interests of avoiding infinite loops, we'll let
# it through as-is rather than doing a recursive call.
@@ -1473,7 +1476,7 @@
# Otherwise, convert it into a Unicode string.
# The unicode(str()) thing is so this will do the same thing on Python 2
# and Python 3.
- return unicode(str(value))
+ return six.text_type(str(value))
def __str__(self):
if self.text:
@@ -1527,7 +1530,7 @@
found = None
# If given a list of items, scan it for a text element that
# matches.
- if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
+ if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, six.string_types)):
for element in markup:
if isinstance(element, NavigableString) \
and self.search(element):
@@ -1540,7 +1543,7 @@
found = self.search_tag(markup)
# If it's text, make sure the text matches.
elif isinstance(markup, NavigableString) or \
- isinstance(markup, basestring):
+ isinstance(markup, six.string_types):
if not self.name and not self.attrs and self._matches(markup, self.text):
found = markup
else:
@@ -1554,7 +1557,7 @@
if isinstance(markup, list) or isinstance(markup, tuple):
# This should only happen when searching a multi-valued attribute
# like 'class'.
- if (isinstance(match_against, unicode)
+ if (isinstance(match_against, six.text_type)
and ' ' in match_against):
# A bit of a special case. If they try to match "foo
# bar" on a multivalue attribute's value, only accept
@@ -1589,7 +1592,7 @@
# None matches None, False, an empty string, an empty list, and so on.
return not match_against
- if isinstance(match_against, unicode):
+ if isinstance(match_against, six.text_type):
# Exact string match
return markup == match_against
diff --git a/third_party/beautifulsoup4/bs4/testing.py b/third_party/beautifulsoup4/bs4/testing.py
index fd4495a..4d94c39 100644
--- a/third_party/beautifulsoup4/bs4/testing.py
+++ b/third_party/beautifulsoup4/bs4/testing.py
@@ -1,5 +1,6 @@
"""Helper classes for tests."""
+from __future__ import absolute_import
import copy
import functools
import unittest
@@ -14,6 +15,7 @@
)
from bs4.builder import HTMLParserTreeBuilder
+import six
default_builder = HTMLParserTreeBuilder
@@ -501,7 +503,7 @@
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
self.assertEqual(
- unicode(soup.rss), markup)
+ six.text_type(soup.rss), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>")
@@ -532,17 +534,17 @@
def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup)
- self.assertEqual(unicode(soup.p), markup)
+ self.assertEqual(six.text_type(soup.p), markup)
def test_namespaced_attributes(self):
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
soup = self.soup(markup)
- self.assertEqual(unicode(soup.foo), markup)
+ self.assertEqual(six.text_type(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup)
- self.assertEqual(unicode(soup.foo), markup)
+ self.assertEqual(six.text_type(soup.foo), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""
diff --git a/third_party/beautifulsoup4/bs4/tests/test_builder_registry.py b/third_party/beautifulsoup4/bs4/tests/test_builder_registry.py
index 92ad10f..7c84fa3 100644
--- a/third_party/beautifulsoup4/bs4/tests/test_builder_registry.py
+++ b/third_party/beautifulsoup4/bs4/tests/test_builder_registry.py
@@ -1,5 +1,6 @@
"""Tests of the builder registry."""
+from __future__ import absolute_import
import unittest
from bs4 import BeautifulSoup
diff --git a/third_party/beautifulsoup4/bs4/tests/test_docs.py b/third_party/beautifulsoup4/bs4/tests/test_docs.py
index 5b9f677..01eb94e 100644
--- a/third_party/beautifulsoup4/bs4/tests/test_docs.py
+++ b/third_party/beautifulsoup4/bs4/tests/test_docs.py
@@ -2,6 +2,7 @@
# pylint: disable-msg=E0611,W0142
+from __future__ import absolute_import
__metaclass__ = type
__all__ = [
'additional_tests',
diff --git a/third_party/beautifulsoup4/bs4/tests/test_html5lib.py b/third_party/beautifulsoup4/bs4/tests/test_html5lib.py
index 594c3e1..31a0659 100644
--- a/third_party/beautifulsoup4/bs4/tests/test_html5lib.py
+++ b/third_party/beautifulsoup4/bs4/tests/test_html5lib.py
@@ -1,11 +1,12 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
+from __future__ import absolute_import
import warnings
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
-except ImportError, e:
+except ImportError as e:
HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
diff --git a/third_party/beautifulsoup4/bs4/tests/test_htmlparser.py b/third_party/beautifulsoup4/bs4/tests/test_htmlparser.py
index bcb5ed2..9eb4bed 100644
--- a/third_party/beautifulsoup4/bs4/tests/test_htmlparser.py
+++ b/third_party/beautifulsoup4/bs4/tests/test_htmlparser.py
@@ -1,6 +1,7 @@
"""Tests to ensure that the html.parser tree builder generates good
trees."""
+from __future__ import absolute_import
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
diff --git a/third_party/beautifulsoup4/bs4/tests/test_lxml.py b/third_party/beautifulsoup4/bs4/tests/test_lxml.py
index 2b2e9b7..20ba5ee 100644
--- a/third_party/beautifulsoup4/bs4/tests/test_lxml.py
+++ b/third_party/beautifulsoup4/bs4/tests/test_lxml.py
@@ -1,13 +1,15 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
+from __future__ import absolute_import
import re
import warnings
+import six
try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
-except ImportError, e:
+except ImportError as e:
LXML_PRESENT = False
LXML_VERSION = (0,)
@@ -62,7 +64,7 @@
# if one is installed.
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<b />")
- self.assertEqual(u"<b/>", unicode(soup.b))
+ self.assertEqual(u"<b/>", six.text_type(soup.b))
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
def test_real_xhtml_document(self):
diff --git a/third_party/beautifulsoup4/bs4/tests/test_soup.py b/third_party/beautifulsoup4/bs4/tests/test_soup.py
index 47ac245..0b42e4f 100644
--- a/third_party/beautifulsoup4/bs4/tests/test_soup.py
+++ b/third_party/beautifulsoup4/bs4/tests/test_soup.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
+from __future__ import absolute_import
import logging
import unittest
import sys
@@ -26,11 +27,12 @@
skipIf,
)
import warnings
+import six
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
-except ImportError, e:
+except ImportError as e:
LXML_PRESENT = False
PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
@@ -204,7 +206,7 @@
ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
- self.assertTrue(isinstance(unicode_output, unicode))
+ self.assertTrue(isinstance(unicode_output, six.text_type))
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
finally:
diff --git a/third_party/beautifulsoup4/bs4/tests/test_tree.py b/third_party/beautifulsoup4/bs4/tests/test_tree.py
index f8515c0..edb2f51 100644
--- a/third_party/beautifulsoup4/bs4/tests/test_tree.py
+++ b/third_party/beautifulsoup4/bs4/tests/test_tree.py
@@ -9,6 +9,7 @@
methods tested here.
"""
+from __future__ import absolute_import
import copy
import pickle
import re
@@ -30,6 +31,7 @@
SoupTest,
skipIf,
)
+import six
XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
@@ -1393,7 +1395,7 @@
def test_prettify_outputs_unicode_by_default(self):
soup = self.soup("<a></a>")
- self.assertEqual(unicode, type(soup.prettify()))
+ self.assertEqual(six.text_type, type(soup.prettify()))
def test_prettify_can_encode_data(self):
soup = self.soup("<a></a>")
diff --git a/third_party/beautifulsoup4/doc/source/conf.py b/third_party/beautifulsoup4/doc/source/conf.py
index 102c3cf..4bad2e9 100644
--- a/third_party/beautifulsoup4/doc/source/conf.py
+++ b/third_party/beautifulsoup4/doc/source/conf.py
@@ -11,6 +11,7 @@
# All configuration values have a default; values that are commented out
# serve to show the default.
+from __future__ import absolute_import
import sys, os
# If extensions (or modules to document with autodoc) are in another directory,
diff --git a/third_party/beautifulsoup4/scripts/demonstrate_parser_differences.py b/third_party/beautifulsoup4/scripts/demonstrate_parser_differences.py
index d84670a..c62f06d 100644
--- a/third_party/beautifulsoup4/scripts/demonstrate_parser_differences.py
+++ b/third_party/beautifulsoup4/scripts/demonstrate_parser_differences.py
@@ -14,6 +14,8 @@
your document the way it does.
"""
+from __future__ import absolute_import
+from __future__ import print_function
import os
import sys
from bs4 import BeautifulSoup
@@ -22,13 +24,13 @@
try:
from bs4.builder import _lxml
parsers.append('lxml')
-except ImportError, e:
+except ImportError as e:
pass
try:
from bs4.builder import _html5lib
parsers.append('html5lib')
-except ImportError, e:
+except ImportError as e:
pass
class Demonstration(object):
@@ -47,7 +49,7 @@
output = soup.div
else:
output = soup
- except Exception, e:
+ except Exception as e:
output = "[EXCEPTION] %s" % str(e)
self.results[parser] = output
if previous_output is None:
@@ -57,15 +59,15 @@
return uniform_results
def dump(self):
- print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8"))
+ print("%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8")))
for parser, output in self.results.items():
- print "%s: %s" % (parser.rjust(13), output.encode("utf8"))
+ print("%s: %s" % (parser.rjust(13), output.encode("utf8")))
different_results = []
uniform_results = []
-print "= Testing the following parsers: %s =" % ", ".join(parsers)
-print
+print("= Testing the following parsers: %s =" % ", ".join(parsers))
+print()
input_file = sys.stdin
if sys.stdin.isatty():
@@ -83,13 +85,13 @@
else:
different_results.append(demo)
-print "== Markup that's handled the same in every parser =="
-print
+print("== Markup that's handled the same in every parser ==")
+print()
for demo in uniform_results:
demo.dump()
- print
-print "== Markup that's not handled the same in every parser =="
-print
+ print()
+print("== Markup that's not handled the same in every parser ==")
+print()
for demo in different_results:
demo.dump()
- print
+ print()
diff --git a/third_party/beautifulsoup4/setup.py b/third_party/beautifulsoup4/setup.py
index 0142ea0..c895096 100644
--- a/third_party/beautifulsoup4/setup.py
+++ b/third_party/beautifulsoup4/setup.py
@@ -1,3 +1,4 @@
+from __future__ import absolute_import
from distutils.core import setup
try: