summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2014-12-11 22:23:26 -0500
committer Leonard Richardson <leonardr@segfault.org> 2014-12-11 22:23:26 -0500
commit 29f97ad45bfd4a0d1a6b5359b4c10ac9e26a87d5 (patch)
tree 5999e990051702ce838a39dbfc0bda0aa018b9a7
parent 056b9348f66969013c9e48026de69d249f3a101c (diff)
download beautifulsoup4-29f97ad45bfd4a0d1a6b5359b4c10ac9e26a87d5.tar.gz
Improved the lxml tree builder's handling of processing
instructions. [bug=1294645]
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/builder/_htmlparser.py 8
-rw-r--r-- bs4/builder/_lxml.py 11
-rw-r--r-- bs4/element.py 2
-rw-r--r-- bs4/testing.py 5
-rw-r--r-- bs4/tests/test_html5lib.py 6
-rw-r--r-- bs4/tests/test_lxml.py 15
7 files changed, 24 insertions, 26 deletions
diff --git a/NEWS.txt b/NEWS.txt
index d45e1c4..df2abef 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -11,6 +11,9 @@
* The select() method can now find tags whose names contain
dashes. Patch by Francisco Canas [bug=1276211]
+* Improved the lxml tree builder's handling of processing
+ instructions. [bug=1294645]
+
* Restored the helpful syntax error that happens when you try to
import the Python 2 edition of Beautiful Soup under Python
3. [bug=1213387]
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index bf231f1..7f3ae73 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -112,14 +112,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
def handle_pi(self, data):
self.soup.endData()
- if data.endswith("?") and data.lower().startswith("xml"):
- # "An XHTML processing instruction using the trailing '?'
- # will cause the '?' to be included in data." - HTMLParser
- # docs.
- #
- # Strip the question mark so we don't end up with two
- # question marks.
- data = data[:-1]
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 978c8df..b0bc8a0 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -7,7 +7,12 @@ from io import BytesIO
from StringIO import StringIO
import collections
from lxml import etree
-from bs4.element import Comment, Doctype, NamespacedAttribute
+from bs4.element import (
+ Comment,
+ Doctype,
+ NamespacedAttribute,
+ ProcessingInstruction,
+)
from bs4.builder import (
FAST,
HTML,
@@ -191,7 +196,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.nsmaps.pop()
def pi(self, target, data):
- pass
+ self.soup.endData()
+ self.soup.handle_data(target + ' ' + data)
+ self.soup.endData(ProcessingInstruction)
def data(self, content):
self.soup.handle_data(content)
diff --git a/bs4/element.py b/bs4/element.py
index 1127c7a..ff716df 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -707,7 +707,7 @@ class CData(PreformattedString):
class ProcessingInstruction(PreformattedString):
PREFIX = u'<?'
- SUFFIX = u'?>'
+ SUFFIX = u'>'
class Comment(PreformattedString):
diff --git a/bs4/testing.py b/bs4/testing.py
index 3e700f3..023a495 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -114,6 +114,11 @@ class HTMLTreeBuilderSmokeTest(object):
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
+ def test_processing_instruction(self):
+ markup = b"""<?PITarget PIContent?>"""
+ soup = self.soup(markup)
+ self.assertEqual(markup, soup.encode("utf8"))
+
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 594c3e1..9a2bacf 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -83,3 +83,9 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
soup = self.soup(markup)
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p')))
+
+ def test_processing_instruction(self):
+ """Processing instructions become comments."""
+ markup = b"""<?PITarget PIContent?>"""
+ soup = self.soup(markup)
+ assert str(soup).startswith("<!--?PITarget PIContent?-->")
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 2b2e9b7..a05870b 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -65,21 +65,6 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertEqual(u"<b/>", unicode(soup.b))
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
- def test_real_xhtml_document(self):
- """lxml strips the XML definition from an XHTML doc, which is fine."""
- markup = b"""<?xml version="1.0" encoding="utf-8"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head><title>Hello.</title></head>
-<body>Goodbye.</body>
-</html>"""
- soup = self.soup(markup)
- self.assertEqual(
- soup.encode("utf-8").replace(b"\n", b''),
- markup.replace(b'\n', b'').replace(
- b'<?xml version="1.0" encoding="utf-8"?>', b''))
-
-
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its XML tree builder.")