From c8781c7d6c5ca5e88af6a465b90f334893578531 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 30 Jul 2016 07:55:09 -0400 Subject: Explained why we test both unicode and bytestring processing instructions. --- NEWS.txt | 7 ++++--- bs4/builder/_lxml.py | 23 ++++++++++++----------- bs4/testing.py | 8 ++++++++ 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/NEWS.txt b/NEWS.txt index e46452d..e27004f 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -1,7 +1,8 @@ -= Unreleased = += 4.5.1 (Unreleased) = -* Fixed a reported (but not duplicated) bug involving processing - instructions fed into the lxml HTML parser. +* Fixed a crash when passing Unicode markup that contained a + processing instruction into the lxml HTML parser on Python + 3. [bug=1608048] = 4.5.0 (20160719) = diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 34bb14e..d2ca287 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -32,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser is_xml = True + processing_instruction_class = XMLProcessingInstruction NAME = "lxml-xml" ALTERNATE_NAMES = ["xml"] @@ -90,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder): Each 4-tuple represents a strategy for parsing the document. """ + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. + is_html = not self.is_xml + if is_html: + self.processing_instruction_class = ProcessingInstruction + else: + self.processing_instruction_class = XMLProcessingInstruction + if isinstance(markup, unicode): # We were given Unicode. Maybe lxml can parse Unicode on # this system? 
@@ -101,16 +112,6 @@ class LXMLTreeBuilderForXML(TreeBuilder): yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) - # Instead of using UnicodeDammit to convert the bytestring to - # Unicode using different encodings, use EncodingDetector to - # iterate over the encodings, and tell lxml to try to parse - # the document as each one in turn. - is_html = not self.is_xml - if is_html: - pass - # self.processing_instruction_class = ProcessingInstruction - else: - self.processing_instruction_class = XMLProcessingInstruction try_encodings = [user_specified_encoding, document_declared_encoding] detector = EncodingDetector( markup, try_encodings, is_html, exclude_encodings) @@ -236,8 +237,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): ALTERNATE_NAMES = ["lxml-html"] features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] - processing_instruction_class = ProcessingInstruction is_xml = False + processing_instruction_class = ProcessingInstruction def default_parser(self, encoding): return etree.HTMLParser diff --git a/bs4/testing.py b/bs4/testing.py index 387f775..3a6ed42 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -139,6 +139,14 @@ class HTMLTreeBuilderSmokeTest(object): markup.replace(b"\n", b"")) def test_processing_instruction(self): + # We test both Unicode and bytestring to verify that + # process_markup correctly sets processing_instruction_class + # even when the markup is already Unicode and there is no + # need to process anything. + markup = u"""<?PITarget PIContent?>""" + soup = self.soup(markup) + self.assertEqual(markup, soup.decode()) + markup = b"""<?PITarget PIContent?>""" soup = self.soup(markup) self.assertEqual(markup, soup.encode("utf8")) -- cgit v1.2.1