diff options
-rw-r--r-- | NEWS.txt | 7 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 23 | ||||
-rw-r--r-- | bs4/testing.py | 8 |
3 files changed, 24 insertions, 14 deletions
@@ -1,7 +1,8 @@ -= Unreleased = += 4.5.1 (Unreleased) = -* Fixed a reported (but not duplicated) bug involving processing - instructions fed into the lxml HTML parser. +* Fixed a crash when passing Unicode markup that contained a + processing instruction into the lxml HTML parser on Python + 3. [bug=1608048] = 4.5.0 (20160719) = diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 34bb14e..d2ca287 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -32,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser is_xml = True + processing_instruction_class = XMLProcessingInstruction NAME = "lxml-xml" ALTERNATE_NAMES = ["xml"] @@ -90,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder): Each 4-tuple represents a strategy for parsing the document. """ + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. + is_html = not self.is_xml + if is_html: + self.processing_instruction_class = ProcessingInstruction + else: + self.processing_instruction_class = XMLProcessingInstruction + if isinstance(markup, unicode): # We were given Unicode. Maybe lxml can parse Unicode on # this system? @@ -101,16 +112,6 @@ class LXMLTreeBuilderForXML(TreeBuilder): yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) - # Instead of using UnicodeDammit to convert the bytestring to - # Unicode using different encodings, use EncodingDetector to - # iterate over the encodings, and tell lxml to try to parse - # the document as each one in turn. - is_html = not self.is_xml - if is_html: - pass - # self.processing_instruction_class = ProcessingInstruction - else: - self.processing_instruction_class = XMLProcessingInstruction try_encodings = [user_specified_encoding, document_declared_encoding] detector = EncodingDetector( markup, try_encodings, is_html, exclude_encodings) @@ -236,8 +237,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): ALTERNATE_NAMES = ["lxml-html"] features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] - processing_instruction_class = ProcessingInstruction is_xml = False + processing_instruction_class = ProcessingInstruction def default_parser(self, encoding): return etree.HTMLParser diff --git a/bs4/testing.py b/bs4/testing.py index 387f775..3a6ed42 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -139,6 +139,14 @@ class HTMLTreeBuilderSmokeTest(object): markup.replace(b"\n", b"")) def test_processing_instruction(self): + # We test both Unicode and bytestring to verify that + # process_markup correctly sets processing_instruction_class + # even when the markup is already Unicode and there is no + # need to process anything. + markup = u"""<?PITarget PIContent?>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.decode()) + markup = b"""<?PITarget PIContent?>""" soup = self.soup(markup) self.assertEqual(markup, soup.encode("utf8")) |