Improved the lxml tree builder's handling of processing instructions. [bug=1294645]
author: Leonard Richardson <leonardr@segfault.org> 2014-12-11 22:23:26 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2014-12-11 22:23:26 -0500
commit: 29f97ad45bfd4a0d1a6b5359b4c10ac9e26a87d5 (patch)
tree: 5999e990051702ce838a39dbfc0bda0aa018b9a7
parent: 056b9348f66969013c9e48026de69d249f3a101c (diff)
download: beautifulsoup4-29f97ad45bfd4a0d1a6b5359b4c10ac9e26a87d5.tar.gz
7 files changed, 24 insertions, 26 deletions
diff --git a/NEWS.txt b/NEWS.txt
index d45e1c4..df2abef 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -11,6 +11,9 @@
 * The select() method can now find tags whose names contain
   dashes. Patch by Francisco Canas [bug=1276211]
 
+* Improved the lxml tree builder's handling of processing
+  instructions. [bug=1294645]
+
 * Restored the helpful syntax error that happens when you try to
   import the Python 2 edition of Beautiful Soup under Python
   3. [bug=1213387]
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index bf231f1..7f3ae73 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -112,14 +112,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
 
     def handle_pi(self, data):
         self.soup.endData()
-        if data.endswith("?") and data.lower().startswith("xml"):
-            # "An XHTML processing instruction using the trailing '?'
-            # will cause the '?' to be included in data." - HTMLParser
-            # docs.
-            #
-            # Strip the question mark so we don't end up with two
-            # question marks.
-            data = data[:-1]
         self.soup.handle_data(data)
         self.soup.endData(ProcessingInstruction)
 
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 978c8df..b0bc8a0 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -7,7 +7,12 @@ from io import BytesIO
 from StringIO import StringIO
 import collections
 from lxml import etree
-from bs4.element import Comment, Doctype, NamespacedAttribute
+from bs4.element import (
+    Comment,
+    Doctype,
+    NamespacedAttribute,
+    ProcessingInstruction,
+)
 from bs4.builder import (
     FAST,
     HTML,
@@ -191,7 +196,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             self.nsmaps.pop()
 
     def pi(self, target, data):
-        pass
+        self.soup.endData()
+        self.soup.handle_data(target + ' ' + data)
+        self.soup.endData(ProcessingInstruction)
 
     def data(self, content):
         self.soup.handle_data(content)
diff --git a/bs4/element.py b/bs4/element.py
index 1127c7a..ff716df 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -707,7 +707,7 @@ class CData(PreformattedString):
 class ProcessingInstruction(PreformattedString):
 
     PREFIX = u'<?'
-    SUFFIX = u'?>'
+    SUFFIX = u'>'
 
 class Comment(PreformattedString):
 
diff --git a/bs4/testing.py b/bs4/testing.py
index 3e700f3..023a495 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -114,6 +114,11 @@ class HTMLTreeBuilderSmokeTest(object):
             soup.encode("utf-8").replace(b"\n", b""),
             markup.replace(b"\n", b""))
 
+    def test_processing_instruction(self):
+        markup = b"""<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode("utf8"))
+
     def test_deepcopy(self):
         """Make sure you can copy the tree builder.
 
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 594c3e1..9a2bacf 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -83,3 +83,9 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         soup = self.soup(markup)
         self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
         self.assertEqual(2, len(soup.find_all('p')))
+
+    def test_processing_instruction(self):
+        """Processing instructions become comments."""
+        markup = b"""<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        assert str(soup).startswith("<!--?PITarget PIContent?-->")
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 2b2e9b7..a05870b 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -65,21 +65,6 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertEqual(u"<b/>", unicode(soup.b))
         self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
 
-    def test_real_xhtml_document(self):
-        """lxml strips the XML definition from an XHTML doc, which is fine."""
-        markup = b"""<?xml version="1.0" encoding="utf-8"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head><title>Hello.</title></head>
-<body>Goodbye.</body>
-</html>"""
-        soup = self.soup(markup)
-        self.assertEqual(
-            soup.encode("utf-8").replace(b"\n", b''),
-            markup.replace(b'\n', b'').replace(
-                b'<?xml version="1.0" encoding="utf-8"?>', b''))
-
-
 @skipIf(
     not LXML_PRESENT,
     "lxml seems not to be present, not testing its XML tree builder.")
author	Leonard Richardson <leonardr@segfault.org>	2014-12-11 22:23:26 -0500
committer	Leonard Richardson <leonardr@segfault.org>	2014-12-11 22:23:26 -0500
commit	29f97ad45bfd4a0d1a6b5359b4c10ac9e26a87d5 (patch)
tree	5999e990051702ce838a39dbfc0bda0aa018b9a7
parent	056b9348f66969013c9e48026de69d249f3a101c (diff)
download	beautifulsoup4-29f97ad45bfd4a0d1a6b5359b4c10ac9e26a87d5.tar.gz