Fixed a bug in which short Unicode input was improperly encoded to ASCII when checking whether or not it was a file on disk. [bug=1227016]

commit: fc4722736b4c776f48188a3d6380601d091fa1aa [log] [tgz]
author: Leonard Richardson <leonardr@segfault.org> Wed Oct 02 01:55:22 2013
committer: Leonard Richardson <leonardr@segfault.org> Wed Oct 02 01:55:22 2013
tree: c74ebb0855c969a36faad28030d6b2fc33d77451
parent: 9c4e8dbf8fe46a57bcffa895ab94c58ab6cc53ed [diff]
diff --git a/NEWS.txt b/NEWS.txt
index 5d0e751..6df956a 100644
--- a/NEWS.txt
+++ b/NEWS.txt

@@ -1,5 +1,9 @@
 = 4.3.2 (Unreleased) =
 
+* Fixed a bug in which short Unicode input was improperly encoded to
+  ASCII when checking whether or not it was a file on
+  disk. [bug=1227016]
+
 * Combined two tests to stop a spurious test failure when tests are
   run by nosetests. [bug=1212445]
 

diff --git a/bs4/__init__.py b/bs4/__init__.py
index 272d44a..6d44c95 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py

@@ -164,7 +164,11 @@
             # involving passing non-markup to Beautiful Soup.
             # Beautiful Soup will still parse the input as markup,
             # just in case that's what the user really wants.
-            if os.path.exists(markup):
+            if isinstance(markup, unicode):
+                possible_filename = markup.encode("utf8")
+            else:
+                possible_filename = markup
+            if os.path.exists(possible_filename):
                 warnings.warn(
                     '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
             if markup[:5] == "http:" or markup[:6] == "https:":

diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 4b80f79..ca8d8b8 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py

@@ -45,7 +45,15 @@
 class BeautifulSoupHTMLParser(HTMLParser):
     def handle_starttag(self, name, attrs):
         # XXX namespace
-        self.soup.handle_starttag(name, None, None, dict(attrs))
+        attr_dict = {}
+        for key, value in attrs:
+            # Change None attribute values to the empty string
+            # for consistency with the other tree builders.
+            if value is None:
+                value = ''
+            attr_dict[key] = value
+            attrvalue = '""'
+        self.soup.handle_starttag(name, None, None, attr_dict)
 
     def handle_endtag(self, name):
         self.soup.handle_endtag(name)

diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index de93513..79a2bc5 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py

@@ -36,6 +36,13 @@
 PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
 PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
 
+class TestConstructor(SoupTest):
+
+    def test_short_unicode_input(self):
+        data = u"<h1>éé</h1>"
+        soup = self.soup(data)
+        self.assertEqual(u"éé", soup.h1.string)
+
 class TestDeprecatedConstructorArguments(SoupTest):
 
     def test_parseOnlyThese_renamed_to_parse_only(self):

diff --git a/doc/source/index.rst b/doc/source/index.rst
index f7ee5f9..0e5f6d1 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst

@@ -1213,8 +1213,8 @@
 You can filter an attribute based on `a string`_, `a regular
 expression`_, `a list`_, `a function`_, or `the value True`_.
 
-This code finds all tags that have an ``id`` attribute, regardless of
-what the value is::
+This code finds all tags whose ``id`` attribute has a value,
+regardless of what the value is::
 
  soup.find_all(id=True)
  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
commit	fc4722736b4c776f48188a3d6380601d091fa1aa	[log] [tgz]
author	Leonard Richardson <leonardr@segfault.org>	Wed Oct 02 01:55:22 2013
committer	Leonard Richardson <leonardr@segfault.org>	Wed Oct 02 01:55:22 2013
tree	c74ebb0855c969a36faad28030d6b2fc33d77451
parent	9c4e8dbf8fe46a57bcffa895ab94c58ab6cc53ed [diff]