Fixed a bug that made the HTMLParser treebuilder generate XML definitions ending with two question marks instead of one. [bug=984258]

author: Leonard Richardson <leonard.richardson@canonical.com> 2012-04-18 08:45:51 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-04-18 08:45:51 -0400
commit: 4500ef64c28cc968679e772763882a2f4ee7cfd9 (patch)
tree: d869fab0a3e65f3c939c8caf68b5ebea0d02378a
parent: ada20cd3b9ba283ac6c327d963df2cc546e7d46b (diff)
download: beautifulsoup4-4500ef64c28cc968679e772763882a2f4ee7cfd9.tar.gz
7 files changed, 63 insertions, 10 deletions
diff --git a/NEWS.txt b/NEWS.txt
index bca8022..61d975f 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,9 @@
+= 4.0.5 (unreleased) =
+
+* Fixed a bug that made the HTMLParser treebuilder generate XML
+  definitions ending with two question marks instead of
+  one. [bug=984258]
+
 = 4.0.4 (20120416) =
 
 * Fixed a bug that sometimes created disconnected trees.
diff --git a/TODO.txt b/TODO.txt
index 8112a5e..c9f9baa 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,3 +1,8 @@
+Additions
+---------
+
+More of the jQuery API: wrap, unwrap (i.e. replace_with_children), nextUntil?
+
 Optimizations
 -------------
 
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index c307ff8..3dee51b 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -96,6 +96,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
 
     def handle_pi(self, data):
         self.soup.endData()
+        if data.endswith("?") and data.lower().startswith("xml"):
+            # "An XHTML processing instruction using the trailing '?'
+            # will cause the '?' to be included in data." - HTMLParser
+            # docs.
+            #
+            # Strip the question mark so we don't end up with two
+            # question marks.
+            data = data[:-1]
         self.soup.handle_data(data)
         self.soup.endData(ProcessingInstruction)
 
diff --git a/bs4/testing.py b/bs4/testing.py
index e9c505c..41c8783 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -94,6 +94,19 @@ class HTMLTreeBuilderSmokeTest(object):
         # Test a namespaced doctype with a public id.
         self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
 
+    def test_real_xhtml_document(self):
+        """A real XHTML document should come out more or less the same as it went in."""
+        markup = b"""<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>Hello.</title></head>
+<body>Goodbye.</body>
+</html>"""
+        soup = self.soup(markup)
+        self.assertEqual(
+            soup.encode("utf-8").replace(b"\n", b""),
+            markup.replace(b"\n", b""))
+
     def test_deepcopy(self):
         """Make sure you can copy the tree builder.
 
@@ -393,14 +406,8 @@ class XMLTreeBuilderSmokeTest(object):
         self.assertEqual(
             soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
 
-    def test_docstring_includes_correct_encoding(self):
-        soup = self.soup("<root/>")
-        self.assertEqual(
-            soup.encode("latin1"),
-            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
-
     def test_real_xhtml_document(self):
-        """A real XHTML document should come out the same as it went in."""
+        """A real XHTML document should come out *exactly* the same as it went in."""
         markup = b"""<?xml version="1.0" encoding="utf-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
 <html xmlns="http://www.w3.org/1999/xhtml">
@@ -408,7 +415,15 @@ class XMLTreeBuilderSmokeTest(object):
 <body>Goodbye.</body>
 </html>"""
         soup = self.soup(markup)
-        self.assertEqual(soup.encode("utf-8"), markup)
+        self.assertEqual(
+            soup.encode("utf-8"), markup)
+
+
+    def test_docstring_includes_correct_encoding(self):
+        soup = self.soup("<root/>")
+        self.assertEqual(
+            soup.encode("latin1"),
+            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
 
     def test_large_xml_document(self):
         """A large XML document should come out the same as it went in."""
@@ -434,6 +449,11 @@ class XMLTreeBuilderSmokeTest(object):
 class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
     """Smoke test for a tree builder that supports HTML5."""
 
+    def test_real_xhtml_document(self):
+        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
+        # XHTML documents in any particular way.
+        pass
+
     def test_html_tags_have_namespace(self):
         markup = "<a>"
         soup = self.soup(markup)
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 6215185..bcb5ed2 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -17,4 +17,3 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
     def test_namespaced_public_doctype(self):
         # html.parser can't handle namespaced doctypes, so skip this one.
         pass
-
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 4e0b12e..39e26bf 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -48,6 +48,21 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
             soup = BeautifulStoneSoup("<b />")
             self.assertEqual(u"<b/>", unicode(soup.b))
 
+    def test_real_xhtml_document(self):
+        """lxml strips the XML definition from an XHTML doc, which is fine."""
+        markup = b"""<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>Hello.</title></head>
+<body>Goodbye.</body>
+</html>"""
+        soup = self.soup(markup)
+        self.assertEqual(
+            soup.encode("utf-8").replace(b"\n", b''),
+            markup.replace(b'\n', b'').replace(
+                b'<?xml version="1.0" encoding="utf-8"?>', b''))
+
+
 @skipIf(
     not LXML_PRESENT,
     "lxml seems not to be present, not testing its XML tree builder.")
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 5abc597..5aab90e 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2183,7 +2183,7 @@ that the document is given an XML declaration instead of being put
 into an <html> tag.::
 
  BeautifulSoup("<a><b /></a>", "xml")
- # <?xml version="1.0" encoding="utf-8">
+ # <?xml version="1.0" encoding="utf-8"?>
  # <a><b /></a>
 
 There are also differences between HTML parsers. If you give Beautiful
author	Leonard Richardson <leonard.richardson@canonical.com>	2012-04-18 08:45:51 -0400
committer	Leonard Richardson <leonard.richardson@canonical.com>	2012-04-18 08:45:51 -0400
commit	4500ef64c28cc968679e772763882a2f4ee7cfd9 (patch)
tree	d869fab0a3e65f3c939c8caf68b5ebea0d02378a
parent	ada20cd3b9ba283ac6c327d963df2cc546e7d46b (diff)
download	beautifulsoup4-4500ef64c28cc968679e772763882a2f4ee7cfd9.tar.gz