diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-04-18 08:45:51 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-04-18 08:45:51 -0400 |
commit | 4500ef64c28cc968679e772763882a2f4ee7cfd9 (patch) | |
tree | d869fab0a3e65f3c939c8caf68b5ebea0d02378a | |
parent | ada20cd3b9ba283ac6c327d963df2cc546e7d46b (diff) | |
download | beautifulsoup4-4500ef64c28cc968679e772763882a2f4ee7cfd9.tar.gz |
Fixed a bug that made the HTMLParser treebuilder generate XML definitions ending with two question marks instead of one. [bug=984258]
-rw-r--r-- | NEWS.txt | 6 | ||||
-rw-r--r-- | TODO.txt | 5 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 8 | ||||
-rw-r--r-- | bs4/testing.py | 36 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 1 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 15 | ||||
-rw-r--r-- | doc/source/index.rst | 2 |
7 files changed, 63 insertions, 10 deletions
@@ -1,3 +1,9 @@
+= 4.0.5 (unreleased) =
+
+* Fixed a bug that made the HTMLParser treebuilder generate XML
+  definitions ending with two question marks instead of
+  one. [bug=984258]
+
 = 4.0.4 (20120416) =
 
 * Fixed a bug that sometimes created disconnected trees.
@@ -1,3 +1,8 @@
+Additions
+---------
+
+More of the jQuery API: wrap, unwrap (i.e. replace_with_children), nextUntil?
+
 Optimizations
 -------------
 
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index c307ff8..3dee51b 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -96,6 +96,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
 
     def handle_pi(self, data):
         self.soup.endData()
+        if data.endswith("?") and data.lower().startswith("xml"):
+            # "An XHTML processing instruction using the trailing '?'
+            # will cause the '?' to be included in data." - HTMLParser
+            # docs.
+            #
+            # Strip the question mark so we don't end up with two
+            # question marks.
+            data = data[:-1]
         self.soup.handle_data(data)
         self.soup.endData(ProcessingInstruction)
 
diff --git a/bs4/testing.py b/bs4/testing.py
index e9c505c..41c8783 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -94,6 +94,19 @@ class HTMLTreeBuilderSmokeTest(object):
         # Test a namespaced doctype with a public id.
         self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
 
+    def test_real_xhtml_document(self):
+        """A real XHTML document should come out more or less the same as it went in."""
+        markup = b"""<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>Hello.</title></head>
+<body>Goodbye.</body>
+</html>"""
+        soup = self.soup(markup)
+        self.assertEqual(
+            soup.encode("utf-8").replace(b"\n", b""),
+            markup.replace(b"\n", b""))
+
     def test_deepcopy(self):
         """Make sure you can copy the tree builder.
 
@@ -393,14 +406,8 @@ class XMLTreeBuilderSmokeTest(object):
         self.assertEqual(
             soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
 
-    def test_docstring_includes_correct_encoding(self):
-        soup = self.soup("<root/>")
-        self.assertEqual(
-            soup.encode("latin1"),
-            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
-
     def test_real_xhtml_document(self):
-        """A real XHTML document should come out the same as it went in."""
+        """A real XHTML document should come out *exactly* the same as it went in."""
         markup = b"""<?xml version="1.0" encoding="utf-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
 <html xmlns="http://www.w3.org/1999/xhtml">
@@ -408,7 +415,15 @@ class XMLTreeBuilderSmokeTest(object):
 <body>Goodbye.</body>
 </html>"""
         soup = self.soup(markup)
-        self.assertEqual(soup.encode("utf-8"), markup)
+        self.assertEqual(
+            soup.encode("utf-8"), markup)
+
+
+    def test_docstring_includes_correct_encoding(self):
+        soup = self.soup("<root/>")
+        self.assertEqual(
+            soup.encode("latin1"),
+            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
 
     def test_large_xml_document(self):
         """A large XML document should come out the same as it went in."""
@@ -434,6 +449,11 @@ class XMLTreeBuilderSmokeTest(object):
 class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
     """Smoke test for a tree builder that supports HTML5."""
 
+    def test_real_xhtml_document(self):
+        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
+        # XHTML documents in any particular way.
+        pass
+
     def test_html_tags_have_namespace(self):
         markup = "<a>"
         soup = self.soup(markup)
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 6215185..bcb5ed2 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -17,4 +17,3 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
     def test_namespaced_public_doctype(self):
         # html.parser can't handle namespaced doctypes, so skip this one.
         pass
-
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 4e0b12e..39e26bf 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -48,6 +48,21 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         soup = BeautifulStoneSoup("<b />")
         self.assertEqual(u"<b/>", unicode(soup.b))
 
+    def test_real_xhtml_document(self):
+        """lxml strips the XML definition from an XHTML doc, which is fine."""
+        markup = b"""<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>Hello.</title></head>
+<body>Goodbye.</body>
+</html>"""
+        soup = self.soup(markup)
+        self.assertEqual(
+            soup.encode("utf-8").replace(b"\n", b''),
+            markup.replace(b'\n', b'').replace(
+                b'<?xml version="1.0" encoding="utf-8"?>', b''))
+
+
 @skipIf(
     not LXML_PRESENT,
     "lxml seems not to be present, not testing its XML tree builder.")
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 5abc597..5aab90e 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2183,7 +2183,7 @@ that the document is given an XML declaration instead of being put
 into an <html> tag.::
 
  BeautifulSoup("<a><b /></a>", "xml")
- # <?xml version="1.0" encoding="utf-8">
+ # <?xml version="1.0" encoding="utf-8"?>
  # <a><b /></a>
 
 There are also differences between HTML parsers. If you give Beautiful