summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonard.richardson@canonical.com>2011-02-27 18:08:59 -0500
committerLeonard Richardson <leonard.richardson@canonical.com>2011-02-27 18:08:59 -0500
commit3156a689a566966079bba7fb19497314e9184b94 (patch)
tree463a004e531b553f9cfb1ab58507392b99fe5efa
parent8c7a895ff8cfc357543966137a7f71e48a9ea02d (diff)
downloadbeautifulsoup4-3156a689a566966079bba7fb19497314e9184b94.tar.gz
Added a tree builder for the built-in HTMLParser, and tests.
-rw-r--r--bs4/builder/__init__.py6
-rw-r--r--bs4/builder/_htmlparser.py94
-rw-r--r--bs4/dammit.py8
-rw-r--r--tests/test_builder_registry.py8
-rw-r--r--tests/test_htmlparser.py126
-rw-r--r--tests/test_lxml.py23
6 files changed, 238 insertions, 27 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 10c6b7f..17dcff3 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -12,6 +12,7 @@ __all__ = [
# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
+STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
@@ -244,7 +245,10 @@ def register_treebuilders_from(module):
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want
-# html5lib to take precedence over lxml, because it's more reliable.
+# html5lib to take precedence over lxml, because it's more
+# reliable. And we only want to use HTMLParser as a last resort.
+import _htmlparser
+register_treebuilders_from(_htmlparser)
try:
import _lxml
register_treebuilders_from(_lxml)
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
new file mode 100644
index 0000000..c293d9e
--- /dev/null
+++ b/bs4/builder/_htmlparser.py
@@ -0,0 +1,94 @@
+"""Use the HTMLParser library to parse HTML files that aren't too bad."""
+
+__all__ = [
+ 'HTMLParserTreeBuilder',
+ ]
+
+from HTMLParser import HTMLParser
+from bs4.element import (
+ CData,
+ Comment,
+ Declaration,
+ Doctype,
+ ProcessingInstruction,
+ )
+from bs4.dammit import EntitySubstitution, UnicodeDammit
+
+from bs4.builder import (
+ HTML,
+ HTMLTreeBuilder,
+ STRICT,
+ )
+
+
+HTMLPARSER = 'html.parser'
+
+class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
+
+ is_xml = False
+ features = [HTML, STRICT, HTMLPARSER]
+
+ def prepare_markup(self, markup, user_specified_encoding=None,
+ document_declared_encoding=None):
+ """
+ :return: A 3-tuple (markup, original encoding, encoding
+ declared within markup).
+ """
+ if isinstance(markup, unicode):
+ return markup, None, None
+
+ try_encodings = [user_specified_encoding, document_declared_encoding]
+ dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
+ return (dammit.markup, dammit.original_encoding,
+ dammit.declared_html_encoding)
+
+ def feed(self, markup):
+ super(HTMLParserTreeBuilder, self).feed(markup)
+
+ def handle_starttag(self, name, attrs):
+ self.soup.handle_starttag(name, dict(attrs))
+
+ def handle_endtag(self, name):
+ self.soup.handle_endtag(name)
+
+ def handle_data(self, data):
+ self.soup.handle_data(data)
+
+ def handle_charref(self, name):
+ self.handle_data(unichr(int(name)))
+
+ def handle_entityref(self, name):
+ character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
+ if character is not None:
+ data = character
+ else:
+ data = "&%s;" % name
+ self.handle_data(data)
+
+ def handle_comment(self, data):
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(Comment)
+
+ def handle_decl(self, data):
+ self.soup.endData()
+ if data.startswith("DOCTYPE "):
+ data = data[len("DOCTYPE "):]
+ self.soup.handle_data(data)
+ self.soup.endData(Doctype)
+
+ def unknown_decl(self, data):
+ if data.upper().startswith('CDATA['):
+ cls = CData
+ data = data[len('CDATA['):]
+ else:
+ cls = Declaration
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(cls)
+
+ def handle_pi(self, data):
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(ProcessingInstruction)
+
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 4483118..75d445e 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -35,6 +35,7 @@ class EntitySubstitution(object):
def _populate_class_variables():
lookup = {}
+ reverse_lookup = {}
characters = []
for codepoint, name in codepoint2name.items():
if codepoint == 34:
@@ -45,10 +46,11 @@ class EntitySubstitution(object):
character = unichr(codepoint)
characters.append(character)
lookup[character] = name
+ reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters)
- return lookup, re.compile(re_definition)
- CHARACTER_TO_HTML_ENTITY, CHARACTER_TO_HTML_ENTITY_RE = (
- _populate_class_variables())
+ return lookup, reverse_lookup, re.compile(re_definition)
+ (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
+ CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
CHARACTER_TO_XML_ENTITY = {
diff --git a/tests/test_builder_registry.py b/tests/test_builder_registry.py
index ee5b2da..655cd06 100644
--- a/tests/test_builder_registry.py
+++ b/tests/test_builder_registry.py
@@ -8,7 +8,8 @@ from bs4.builder import (
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
TreeBuilderRegistry,
- HTML5TreeBuilder
+ HTML5TreeBuilder,
+ HTMLParserTreeBuilder,
)
@@ -21,6 +22,8 @@ class BuiltInRegistryTest(unittest.TestCase):
LXMLTreeBuilder)
self.assertEquals(registry.lookup('permissive', 'xml'),
LXMLTreeBuilderForXML)
+ self.assertEquals(registry.lookup('strict', 'html'),
+ HTMLParserTreeBuilder)
self.assertEquals(registry.lookup('permissive', 'html'),
HTML5TreeBuilder)
@@ -36,6 +39,9 @@ class BuiltInRegistryTest(unittest.TestCase):
self.assertEquals(registry.lookup('html5lib'),
HTML5TreeBuilder)
+ self.assertEquals(registry.lookup('html.parser'),
+ HTMLParserTreeBuilder)
+
def test_unimplemented_combinations(self):
self.assertEquals(registry.lookup('fast', 'permissive', 'html'),
None)
diff --git a/tests/test_htmlparser.py b/tests/test_htmlparser.py
new file mode 100644
index 0000000..c8a446e
--- /dev/null
+++ b/tests/test_htmlparser.py
@@ -0,0 +1,126 @@
+from HTMLParser import HTMLParseError
+from bs4.builder import HTMLParserTreeBuilder
+from bs4.element import CData
+from test_lxml import (
+ TestLXMLBuilder,
+ TestLXMLBuilderEncodingConversion,
+ TestLXMLBuilderInvalidMarkup,
+ )
+
+class TestHTMLParserTreeBuilder(TestLXMLBuilder):
+ """See `BuilderSmokeTest`."""
+
+ @property
+ def default_builder(self):
+ return HTMLParserTreeBuilder()
+
+ def test_bare_string(self):
+ # A bare string is turned into some kind of HTML document or
+ # fragment recognizable as the original string.
+ #
+ # HTMLParser does not modify the bare string at all.
+ self.assertSoupEquals("A bare string")
+
+ def test_cdata_where_its_ok(self):
+ # HTMLParser recognizes CDATA sections and passes them through.
+ markup = "<svg><![CDATA[foobar]]></svg>"
+ self.assertSoupEquals(markup)
+ soup = self.soup(markup)
+ string = soup.svg.string
+ self.assertEquals(string, "foobar")
+ self.assertTrue(isinstance(string, CData))
+
+ # These are tests that could be 'fixed' by improving the
+ # HTMLParserTreeBuilder, but I don't think it's worth it. Users
+ # will have fewer headaches if they use one of the other tree
+ # builders.
+
+ def test_empty_element(self):
+ # HTML's empty-element tags are not recognized as such
+ # unless they are presented as empty-element tags.
+ self.assertSoupEquals(
+ "<p>A <meta> tag</p>", "<p>A <meta> tag</meta></p>")
+
+ self.assertSoupEquals(
+ "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>")
+
+ def test_entities_in_attribute_values_converted_during_parsing(self):
+
+ # The numeric entity isn't recognized without the closing
+ # semicolon.
+ text = '<x t="pi&#241ata">'
+ expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
+ soup = self.soup(text)
+ self.assertEquals(soup.x['t'], "pi&#241ata")
+
+ text = '<x t="pi&#241;ata">'
+ expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
+ soup = self.soup(text)
+ self.assertEquals(soup.x['t'], u"pi\xf1ata")
+
+ text = '<x t="pi&#xf1;ata">'
+ soup = self.soup(text)
+ self.assertEquals(soup.x['t'], expected)
+
+ text = '<x t="sacr&eacute; bleu">'
+ soup = self.soup(text)
+ self.assertEquals(
+ soup.x['t'],
+ u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")
+
+ # This can cause valid HTML to become invalid.
+ valid_url = '<a href="http://example.org?a=1&amp;b=2;3">foo</a>'
+ soup = self.soup(valid_url)
+ self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
+
+ # I think it would be very difficult to 'fix' these tests, judging
+ # from my experience with previous versions of Beautiful Soup.
+ def test_naked_ampersands(self):
+ # Ampersands are treated as entities.
+ text = "<p>AT&T</p>"
+ soup = self.soup(text)
+ self.assertEquals(soup.p.string, "AT&T;")
+
+ def test_literal_in_textarea(self):
+ # Anything inside a <textarea> is supposed to be treated as
+ # the literal value of the field, (XXX citation
+ # needed). html5lib does this correctly. But, HTMLParser does its
+ # best to parse the contents of a <textarea> as HTML.
+ text = '<textarea>Junk like <b> tags and <&<&amp;</textarea>'
+ soup = self.soup(text)
+ self.assertEquals(len(soup.textarea.contents), 2)
+ self.assertEquals(soup.textarea.contents[0], u"Junk like ")
+ self.assertEquals(soup.textarea.contents[1].name, 'b')
+ self.assertEquals(soup.textarea.b.string, u" tags and <&<&")
+
+ def test_literal_in_script(self):
+ # The contents of a <script> tag are supposed to be treated as
+ # a literal string, even if that string contains HTML. But
+ # HTMLParser attempts to parse some of the HTML, causing much
+ # pain.
+ javascript = 'if (i < 2) { alert("<b>foo</b>"); }'
+ soup = self.soup('<script>%s</script>' % javascript)
+ self.assertEquals(soup.script.contents,
+ ['if (i < 2) { alert("<b>foo',
+ '"); }'])
+
+ # Namespaced doctypes cause an HTMLParseError
+ def test_namespaced_system_doctype(self):
+ self.assertRaises(HTMLParseError, self._test_doctype,
+ 'xsl:stylesheet SYSTEM "htmlent.dtd"')
+
+ def test_namespaced_public_doctype(self):
+ self.assertRaises(HTMLParseError, self._test_doctype,
+ 'xsl:stylesheet PUBLIC "htmlent.dtd"')
+
+
+class TestHTMLParserTreeBuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
+ # Oddly enough, HTMLParser seems to handle invalid markup exactly
+ # the same as lxml.
+ pass
+
+
+class TestHTMLParserTreeBuilderEncodingConversion(
+ TestLXMLBuilderEncodingConversion):
+ # Re-run the lxml tests for HTMLParser
+ pass
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 0eec688..7e83eff 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -168,27 +168,6 @@ class TestLXMLBuilder(SoupTest):
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
self.assertSoupEquals(text, expected)
- def test_entities_in_attribute_values_converted_during_parsing(self):
- text = '<x t="pi&#241ata">'
- expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
- soup = self.soup(text)
- self.assertEquals(soup.x['t'], expected)
-
- text = '<x t="pi&#xf1;ata">'
- soup = self.soup(text)
- self.assertEquals(soup.x['t'], expected)
-
- text = '<x t="sacr&eacute; bleu">'
- soup = self.soup(text)
- self.assertEquals(
- soup.x['t'],
- u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")
-
- # This can cause valid HTML to become invalid.
- valid_url = '<a href="http://example.org?a=1&amp;b=2;3">foo</a>'
- soup = self.soup(valid_url)
- self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
-
def test_smart_quotes_converted_on_the_way_in(self):
# Microsoft smart quotes are converted to Unicode characters during
# parsing.
@@ -230,7 +209,7 @@ class TestLXMLBuilder(SoupTest):
# Test a namespaced doctype with a system id.
self._test_doctype('xsl:stylesheet SYSTEM "htmlent.dtd"')
- def test_namespaced_system_doctype(self):
+ def test_namespaced_public_doctype(self):
# Test a namespaced doctype with a public id.
self._test_doctype('xsl:stylesheet PUBLIC "htmlent.dtd"')