diff options
author | Leonard Richardson <leonardr@segfault.org> | 2014-12-11 22:23:26 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2014-12-11 22:23:26 -0500 |
commit | 29f97ad45bfd4a0d1a6b5359b4c10ac9e26a87d5 (patch) | |
tree | 5999e990051702ce838a39dbfc0bda0aa018b9a7 /bs4/builder | |
parent | 056b9348f66969013c9e48026de69d249f3a101c (diff) | |
download | beautifulsoup4-29f97ad45bfd4a0d1a6b5359b4c10ac9e26a87d5.tar.gz |
Improved the lxml tree builder's handling of processing
instructions. [bug=1294645]
Diffstat (limited to 'bs4/builder')
-rw-r--r-- | bs4/builder/_htmlparser.py | 8 | -------- |
-rw-r--r-- | bs4/builder/_lxml.py | 11 | +++++++++-- |
2 files changed, 9 insertions, 10 deletions
```diff
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index bf231f1..7f3ae73 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -112,14 +112,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
 
     def handle_pi(self, data):
         self.soup.endData()
-        if data.endswith("?") and data.lower().startswith("xml"):
-            # "An XHTML processing instruction using the trailing '?'
-            # will cause the '?' to be included in data." - HTMLParser
-            # docs.
-            #
-            # Strip the question mark so we don't end up with two
-            # question marks.
-            data = data[:-1]
         self.soup.handle_data(data)
         self.soup.endData(ProcessingInstruction)
 
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 978c8df..b0bc8a0 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -7,7 +7,12 @@ from io import BytesIO
 from StringIO import StringIO
 import collections
 from lxml import etree
-from bs4.element import Comment, Doctype, NamespacedAttribute
+from bs4.element import (
+    Comment,
+    Doctype,
+    NamespacedAttribute,
+    ProcessingInstruction,
+)
 from bs4.builder import (
     FAST,
     HTML,
@@ -191,7 +196,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             self.nsmaps.pop()
 
     def pi(self, target, data):
-        pass
+        self.soup.endData()
+        self.soup.handle_data(target + ' ' + data)
+        self.soup.endData(ProcessingInstruction)
 
     def data(self, content):
         self.soup.handle_data(content)
```