Explained why we test both Unicode and bytestring processing instructions.

author: Leonard Richardson <leonardr@segfault.org> 2016-07-30 07:55:09 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2016-07-30 07:55:09 -0400
commit: c8781c7d6c5ca5e88af6a465b90f334893578531 (patch)
tree: 32163b784b6b30404495000a1873064b2e5b3498
parent: 21d4cc074cc68fa3a37d04b746e19810cd5f296f (diff)
download: beautifulsoup4-c8781c7d6c5ca5e88af6a465b90f334893578531.tar.gz
3 files changed, 24 insertions, 14 deletions
diff --git a/NEWS.txt b/NEWS.txt
index e46452d..e27004f 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,7 +1,8 @@
-= Unreleased =
+= 4.5.1 (Unreleased) =
 
-* Fixed a reported (but not duplicated) bug involving processing
-  instructions fed into the lxml HTML parser.
+* Fixed a crash when passing Unicode markup that contained a
+  processing instruction into the lxml HTML parser on Python
+  3. [bug=1608048]
 
 = 4.5.0 (20160719) =
 
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 34bb14e..d2ca287 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -32,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     DEFAULT_PARSER_CLASS = etree.XMLParser
 
     is_xml = True
+    processing_instruction_class = XMLProcessingInstruction
 
     NAME = "lxml-xml"
     ALTERNATE_NAMES = ["xml"]
@@ -90,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
         Each 4-tuple represents a strategy for parsing the document.
         """
+        # Instead of using UnicodeDammit to convert the bytestring to
+        # Unicode using different encodings, use EncodingDetector to
+        # iterate over the encodings, and tell lxml to try to parse
+        # the document as each one in turn.
+        is_html = not self.is_xml
+        if is_html:
+            self.processing_instruction_class = ProcessingInstruction
+        else:
+            self.processing_instruction_class = XMLProcessingInstruction
+
         if isinstance(markup, unicode):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?
@@ -101,16 +112,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             yield (markup.encode("utf8"), "utf8",
                    document_declared_encoding, False)
 
-        # Instead of using UnicodeDammit to convert the bytestring to
-        # Unicode using different encodings, use EncodingDetector to
-        # iterate over the encodings, and tell lxml to try to parse
-        # the document as each one in turn.
-        is_html = not self.is_xml
-        if is_html:
-            pass
-            # self.processing_instruction_class = ProcessingInstruction
-        else:
-            self.processing_instruction_class = XMLProcessingInstruction
         try_encodings = [user_specified_encoding, document_declared_encoding]
         detector = EncodingDetector(
             markup, try_encodings, is_html, exclude_encodings)
@@ -236,8 +237,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
     ALTERNATE_NAMES = ["lxml-html"]
 
     features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
-    processing_instruction_class = ProcessingInstruction
     is_xml = False
+    processing_instruction_class = ProcessingInstruction
 
     def default_parser(self, encoding):
         return etree.HTMLParser
diff --git a/bs4/testing.py b/bs4/testing.py
index 387f775..3a6ed42 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -139,6 +139,14 @@ class HTMLTreeBuilderSmokeTest(object):
             markup.replace(b"\n", b""))
 
     def test_processing_instruction(self):
+        # We test both Unicode and bytestring to verify that
+        # prepare_markup correctly sets processing_instruction_class
+        # even when the markup is already Unicode and there is no
+        # need to detect its encoding.
+        markup = u"""<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.decode())
+
         markup = b"""<?PITarget PIContent?>"""
         soup = self.soup(markup)
         self.assertEqual(markup, soup.encode("utf8"))
author	Leonard Richardson <leonardr@segfault.org>	2016-07-30 07:55:09 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2016-07-30 07:55:09 -0400
commit	c8781c7d6c5ca5e88af6a465b90f334893578531 (patch)
tree	32163b784b6b30404495000a1873064b2e5b3498
parent	21d4cc074cc68fa3a37d04b746e19810cd5f296f (diff)
download	beautifulsoup4-c8781c7d6c5ca5e88af6a465b90f334893578531.tar.gz