diff options
| | | |
|---|---|---|
| author | Leonard Richardson <leonardr@segfault.org> | 2014-12-11 22:23:26 -0500 |
| committer | Leonard Richardson <leonardr@segfault.org> | 2014-12-11 22:23:26 -0500 |
| commit | 29f97ad45bfd4a0d1a6b5359b4c10ac9e26a87d5 (patch) | |
| tree | 5999e990051702ce838a39dbfc0bda0aa018b9a7 | |
| parent | 056b9348f66969013c9e48026de69d249f3a101c (diff) | |
| download | beautifulsoup4-29f97ad45bfd4a0d1a6b5359b4c10ac9e26a87d5.tar.gz | |

Improved the lxml tree builder's handling of processing
instructions. [bug=1294645]

| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | NEWS.txt | 3 |
| -rw-r--r-- | bs4/builder/_htmlparser.py | 8 |
| -rw-r--r-- | bs4/builder/_lxml.py | 11 |
| -rw-r--r-- | bs4/element.py | 2 |
| -rw-r--r-- | bs4/testing.py | 5 |
| -rw-r--r-- | bs4/tests/test_html5lib.py | 6 |
| -rw-r--r-- | bs4/tests/test_lxml.py | 15 |

7 files changed, 24 insertions, 26 deletions

```diff
@@ -11,6 +11,9 @@
 * The select() method can now find tags whose names contain
   dashes. Patch by Francisco Canas [bug=1276211]
 
+* Improved the lxml tree builder's handling of processing
+  instructions. [bug=1294645]
+
 * Restored the helpful syntax error that happens when you try to
   import the Python 2 edition of Beautiful Soup under Python 3.
   [bug=1213387]
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index bf231f1..7f3ae73 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -112,14 +112,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
 
     def handle_pi(self, data):
         self.soup.endData()
-        if data.endswith("?") and data.lower().startswith("xml"):
-            # "An XHTML processing instruction using the trailing '?'
-            # will cause the '?' to be included in data." - HTMLParser
-            # docs.
-            #
-            # Strip the question mark so we don't end up with two
-            # question marks.
-            data = data[:-1]
         self.soup.handle_data(data)
         self.soup.endData(ProcessingInstruction)
 
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 978c8df..b0bc8a0 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -7,7 +7,12 @@ from io import BytesIO
 from StringIO import StringIO
 import collections
 from lxml import etree
-from bs4.element import Comment, Doctype, NamespacedAttribute
+from bs4.element import (
+    Comment,
+    Doctype,
+    NamespacedAttribute,
+    ProcessingInstruction,
+)
 from bs4.builder import (
     FAST,
     HTML,
@@ -191,7 +196,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         self.nsmaps.pop()
 
     def pi(self, target, data):
-        pass
+        self.soup.endData()
+        self.soup.handle_data(target + ' ' + data)
+        self.soup.endData(ProcessingInstruction)
 
     def data(self, content):
         self.soup.handle_data(content)
diff --git a/bs4/element.py b/bs4/element.py
index 1127c7a..ff716df 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -707,7 +707,7 @@ class CData(PreformattedString):
 class ProcessingInstruction(PreformattedString):
 
     PREFIX = u'<?'
-    SUFFIX = u'?>'
+    SUFFIX = u'>'
 
 class Comment(PreformattedString):
 
diff --git a/bs4/testing.py b/bs4/testing.py
index 3e700f3..023a495 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -114,6 +114,11 @@ class HTMLTreeBuilderSmokeTest(object):
             soup.encode("utf-8").replace(b"\n", b""),
             markup.replace(b"\n", b""))
 
+    def test_processing_instruction(self):
+        markup = b"""<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode("utf8"))
+
     def test_deepcopy(self):
         """Make sure you can copy the tree builder.
 
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 594c3e1..9a2bacf 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -83,3 +83,9 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         soup = self.soup(markup)
         self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
         self.assertEqual(2, len(soup.find_all('p')))
+
+    def test_processing_instruction(self):
+        """Processing instructions become comments."""
+        markup = b"""<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        assert str(soup).startswith("<!--?PITarget PIContent?-->")
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 2b2e9b7..a05870b 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -65,21 +65,6 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertEqual(u"<b/>", unicode(soup.b))
         self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
 
-    def test_real_xhtml_document(self):
-        """lxml strips the XML definition from an XHTML doc, which is fine."""
-        markup = b"""<?xml version="1.0" encoding="utf-8"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head><title>Hello.</title></head>
-<body>Goodbye.</body>
-</html>"""
-        soup = self.soup(markup)
-        self.assertEqual(
-            soup.encode("utf-8").replace(b"\n", b''),
-            markup.replace(b'\n', b'').replace(
-                b'<?xml version="1.0" encoding="utf-8"?>', b''))
-
-
 @skipIf(
     not LXML_PRESENT,
     "lxml seems not to be present, not testing its XML tree builder.")
```