summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2016-07-30 07:55:09 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2016-07-30 07:55:09 -0400
commit: c8781c7d6c5ca5e88af6a465b90f334893578531 (patch)
tree: 32163b784b6b30404495000a1873064b2e5b3498
parent: 21d4cc074cc68fa3a37d04b746e19810cd5f296f (diff)
download: beautifulsoup4-c8781c7d6c5ca5e88af6a465b90f334893578531.tar.gz
Explained why we test both unicode and bytestring processing instructions.
-rw-r--r--  NEWS.txt  7
-rw-r--r--  bs4/builder/_lxml.py  23
-rw-r--r--  bs4/testing.py  8
3 files changed, 24 insertions, 14 deletions
diff --git a/NEWS.txt b/NEWS.txt
index e46452d..e27004f 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,7 +1,8 @@
-= Unreleased =
+= 4.5.1 (Unreleased) =
-* Fixed a reported (but not duplicated) bug involving processing
- instructions fed into the lxml HTML parser.
+* Fixed a crash when passing Unicode markup that contained a
+ processing instruction into the lxml HTML parser on Python
+ 3. [bug=1608048]
= 4.5.0 (20160719) =
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 34bb14e..d2ca287 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -32,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
is_xml = True
+ processing_instruction_class = XMLProcessingInstruction
NAME = "lxml-xml"
ALTERNATE_NAMES = ["xml"]
@@ -90,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):
Each 4-tuple represents a strategy for parsing the document.
"""
+ # Instead of using UnicodeDammit to convert the bytestring to
+ # Unicode using different encodings, use EncodingDetector to
+ # iterate over the encodings, and tell lxml to try to parse
+ # the document as each one in turn.
+ is_html = not self.is_xml
+ if is_html:
+ self.processing_instruction_class = ProcessingInstruction
+ else:
+ self.processing_instruction_class = XMLProcessingInstruction
+
if isinstance(markup, unicode):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
@@ -101,16 +112,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
yield (markup.encode("utf8"), "utf8",
document_declared_encoding, False)
- # Instead of using UnicodeDammit to convert the bytestring to
- # Unicode using different encodings, use EncodingDetector to
- # iterate over the encodings, and tell lxml to try to parse
- # the document as each one in turn.
- is_html = not self.is_xml
- if is_html:
- pass
- # self.processing_instruction_class = ProcessingInstruction
- else:
- self.processing_instruction_class = XMLProcessingInstruction
try_encodings = [user_specified_encoding, document_declared_encoding]
detector = EncodingDetector(
markup, try_encodings, is_html, exclude_encodings)
@@ -236,8 +237,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
ALTERNATE_NAMES = ["lxml-html"]
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
- processing_instruction_class = ProcessingInstruction
is_xml = False
+ processing_instruction_class = ProcessingInstruction
def default_parser(self, encoding):
return etree.HTMLParser
diff --git a/bs4/testing.py b/bs4/testing.py
index 387f775..3a6ed42 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -139,6 +139,14 @@ class HTMLTreeBuilderSmokeTest(object):
markup.replace(b"\n", b""))
def test_processing_instruction(self):
+ # We test both Unicode and bytestring to verify that
+ # process_markup correctly sets processing_instruction_class
+ # even when the markup is already Unicode and there is no
+ # need to process anything.
+ markup = u"""<?PITarget PIContent?>"""
+ soup = self.soup(markup)
+ self.assertEqual(markup, soup.decode())
+
markup = b"""<?PITarget PIContent?>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))