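Fixed a bug in which short Unicode input was improperly encoded to ASCII when checking whether or not it was a file on
disk. [bug=1227016] Also changed the html.parser tree builder to convert None attribute values to the empty string, for
consistency with the other tree builders, and reworded the ``id=True`` example in the documentation.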
diff --git a/NEWS.txt b/NEWS.txt
index 5d0e751..6df956a 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,9 @@
= 4.3.2 (Unreleased) =
+* Fixed a bug in which short Unicode input was improperly encoded to
+ ASCII when checking whether or not it was a file on
+ disk. [bug=1227016]
+
* Combined two tests to stop a spurious test failure when tests are
run by nosetests. [bug=1212445]
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 272d44a..6d44c95 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -164,7 +164,11 @@
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants.
- if os.path.exists(markup):
+ if isinstance(markup, unicode):
+ possible_filename = markup.encode("utf8")
+ else:
+ possible_filename = markup
+ if os.path.exists(possible_filename):
warnings.warn(
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
if markup[:5] == "http:" or markup[:6] == "https:":
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 4b80f79..ca8d8b8 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -45,7 +45,15 @@
class BeautifulSoupHTMLParser(HTMLParser):
def handle_starttag(self, name, attrs):
+ """Pass a start tag to the soup, replacing None attribute values with ''."""
# XXX namespace
- self.soup.handle_starttag(name, None, None, dict(attrs))
+ attr_dict = {}
+ for key, value in attrs:
+ # Change None attribute values to the empty string
+ # for consistency with the other tree builders.
+ if value is None:
+ value = ''
+ attr_dict[key] = value
+ self.soup.handle_starttag(name, None, None, attr_dict)
def handle_endtag(self, name):
self.soup.handle_endtag(name)
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index de93513..79a2bc5 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -36,6 +36,13 @@
PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
+class TestConstructor(SoupTest):
+
+ def test_short_unicode_input(self):
+ data = u"<h1>éé</h1>"
+ soup = self.soup(data)
+ self.assertEqual(u"éé", soup.h1.string)
+
class TestDeprecatedConstructorArguments(SoupTest):
def test_parseOnlyThese_renamed_to_parse_only(self):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index f7ee5f9..0e5f6d1 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1213,8 +1213,8 @@
You can filter an attribute based on `a string`_, `a regular
expression`_, `a list`_, `a function`_, or `the value True`_.
-This code finds all tags that have an ``id`` attribute, regardless of
-what the value is::
+This code finds all tags whose ``id`` attribute has a value,
+regardless of what the value is::
soup.find_all(id=True)
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,