summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonard.richardson@canonical.com>2011-02-27 18:08:59 -0500
committerLeonard Richardson <leonard.richardson@canonical.com>2011-02-27 18:08:59 -0500
commit3156a689a566966079bba7fb19497314e9184b94 (patch)
tree463a004e531b553f9cfb1ab58507392b99fe5efa
parent8c7a895ff8cfc357543966137a7f71e48a9ea02d (diff)
downloadbeautifulsoup4-3156a689a566966079bba7fb19497314e9184b94.tar.gz
Added a tree builder for the built-in HTMLParser, and tests.
-rw-r--r--bs4/builder/__init__.py6
-rw-r--r--bs4/builder/_htmlparser.py94
-rw-r--r--bs4/dammit.py8
-rw-r--r--tests/test_builder_registry.py8
-rw-r--r--tests/test_htmlparser.py126
-rw-r--r--tests/test_lxml.py23
6 files changed, 238 insertions, 27 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 10c6b7f..17dcff3 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -12,6 +12,7 @@ __all__ = [
# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
+STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
@@ -244,7 +245,10 @@ def register_treebuilders_from(module):
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want
-# html5lib to take precedence over lxml, because it's more reliable.
+# html5lib to take precedence over lxml, because it's more
+# reliable. And we only want to use HTMLParser as a last resort.
+import _htmlparser
+register_treebuilders_from(_htmlparser)
try:
import _lxml
register_treebuilders_from(_lxml)
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
new file mode 100644
index 0000000..c293d9e
--- /dev/null
+++ b/bs4/builder/_htmlparser.py
@@ -0,0 +1,94 @@
+"""Use the HTMLParser library to parse HTML files that aren't too bad."""
+
+__all__ = [
+ 'HTMLParserTreeBuilder',
+ ]
+
+from HTMLParser import HTMLParser
+from bs4.element import (
+ CData,
+ Comment,
+ Declaration,
+ Doctype,
+ ProcessingInstruction,
+ )
+from bs4.dammit import EntitySubstitution, UnicodeDammit
+
+from bs4.builder import (
+ HTML,
+ HTMLTreeBuilder,
+ STRICT,
+ )
+
+
+HTMLPARSER = 'html.parser'
+
+class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
+
+ is_xml = False
+ features = [HTML, STRICT, HTMLPARSER]
+
+ def prepare_markup(self, markup, user_specified_encoding=None,
+ document_declared_encoding=None):
+ """
+ :return: A 3-tuple (markup, original encoding, encoding
+ declared within markup).
+ """
+ if isinstance(markup, unicode):
+ return markup, None, None
+
+ try_encodings = [user_specified_encoding, document_declared_encoding]
+ dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
+ return (dammit.markup, dammit.original_encoding,
+ dammit.declared_html_encoding)
+
+ def feed(self, markup):
+ super(HTMLParserTreeBuilder, self).feed(markup)
+
+ def handle_starttag(self, name, attrs):
+ self.soup.handle_starttag(name, dict(attrs))
+
+ def handle_endtag(self, name):
+ self.soup.handle_endtag(name)
+
+ def handle_data(self, data):
+ self.soup.handle_data(data)
+
+ def handle_charref(self, name):
+ self.handle_data(unichr(int(name)))
+
+ def handle_entityref(self, name):
+ character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
+ if character is not None:
+ data = character
+ else:
+ data = "&%s;" % name
+ self.handle_data(data)
+
+ def handle_comment(self, data):
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(Comment)
+
+ def handle_decl(self, data):
+ self.soup.endData()
+ if data.startswith("DOCTYPE "):
+ data = data[len("DOCTYPE "):]
+ self.soup.handle_data(data)
+ self.soup.endData(Doctype)
+
+ def unknown_decl(self, data):
+ if data.upper().startswith('CDATA['):
+ cls = CData
+ data = data[len('CDATA['):]
+ else:
+ cls = Declaration
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(cls)
+
+ def handle_pi(self, data):
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(ProcessingInstruction)
+
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 4483118..75d445e 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -35,6 +35,7 @@ class EntitySubstitution(object):
def _populate_class_variables():
lookup = {}
+ reverse_lookup = {}
characters = []
for codepoint, name in codepoint2name.items():
if codepoint == 34:
@@ -45,10 +46,11 @@ class EntitySubstitution(object):
character = unichr(codepoint)
characters.append(character)
lookup[character] = name
+ reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters)
- return lookup, re.compile(re_definition)
- CHARACTER_TO_HTML_ENTITY, CHARACTER_TO_HTML_ENTITY_RE = (
- _populate_class_variables())
+ return lookup, reverse_lookup, re.compile(re_definition)
+ (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
+ CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
CHARACTER_TO_XML_ENTITY = {
diff --git a/tests/test_builder_registry.py b/tests/test_builder_registry.py
index ee5b2da..655cd06 100644
--- a/tests/test_builder_registry.py
+++ b/tests/test_builder_registry.py
@@ -8,7 +8,8 @@ from bs4.builder import (
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
TreeBuilderRegistry,
- HTML5TreeBuilder
+ HTML5TreeBuilder,
+ HTMLParserTreeBuilder,
)
@@ -21,6 +22,8 @@ class BuiltInRegistryTest(unittest.TestCase):
LXMLTreeBuilder)
self.assertEquals(registry.lookup('permissive', 'xml'),
LXMLTreeBuilderForXML)
+ self.assertEquals(registry.lookup('strict', 'html'),
+ HTMLParserTreeBuilder)
self.assertEquals(registry.lookup('permissive', 'html'),
HTML5TreeBuilder)
@@ -36,6 +39,9 @@ class BuiltInRegistryTest(unittest.TestCase):
self.assertEquals(registry.lookup('html5lib'),
HTML5TreeBuilder)
+ self.assertEquals(registry.lookup('html.parser'),
+ HTMLParserTreeBuilder)
+
def test_unimplemented_combinations(self):
self.assertEquals(registry.lookup('fast', 'permissive', 'html'),
None)
diff --git a/tests/test_htmlparser.py b/tests/test_htmlparser.py
new file mode 100644
index 0000000..c8a446e
--- /dev/null
+++ b/tests/test_htmlparser.py
@@ -0,0 +1,126 @@
+from HTMLParser import HTMLParseError
+from bs4.builder import HTMLParserTreeBuilder
+from bs4.element import CData
+from test_lxml import (
+ TestLXMLBuilder,
+ TestLXMLBuilderEncodingConversion,
+ TestLXMLBuilderInvalidMarkup,
+ )
+
+class TestHTMLParserTreeBuilder(TestLXMLBuilder):
+ """See `BuilderSmokeTest`."""
+
+ @property
+ def default_builder(self):
+ return HTMLParserTreeBuilder()
+
+ def test_bare_string(self):
+ # A bare string is turned into some kind of HTML document or
+ # fragment recognizable as the original string.
+ #
+ # HTMLParser does not modify the bare string at all.
+ self.assertSoupEquals("A bare string")
+
+ def test_cdata_where_its_ok(self):
+ # HTMLParser recognizes CDATA sections and passes them through.
+ markup = "<svg><![CDATA[foobar]]></svg>"
+ self.assertSoupEquals(markup)
+ soup = self.soup(markup)
+ string = soup.svg.string
+ self.assertEquals(string, "foobar")
+ self.assertTrue(isinstance(string, CData))
+
+ # These are tests that could be 'fixed' by improving the
+ # HTMLParserTreeBuilder, but I don't think it's worth it. Users
+ # will have fewer headaches if they use one of the other tree
+ # builders.
+
+ def test_empty_element(self):
+ # HTML's empty-element tags are not recognized as such
+ # unless they are presented as empty-element tags.
+ self.assertSoupEquals(
+ "<p>A <meta> tag</p>", "<p>A <meta> tag</meta></p>")
+
+ self.assertSoupEquals(
+ "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>")
+
+ def test_entities_in_attribute_values_converted_during_parsing(self):
+
+ # The numeric entity isn't recognized without the closing
+ # semicolon.
+ text = '<x t="pi&#241ata">'
+ expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
+ soup = self.soup(text)
+ self.assertEquals(soup.x['t'], "pi&#241ata")
+
+ text = '<x t="pi&#241;ata">'
+ expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
+ soup = self.soup(text)
+ self.assertEquals(soup.x['t'], u"pi\xf1ata")
+
+ text = '<x t="pi&#xf1;ata">'
+ soup = self.soup(text)
+ self.assertEquals(soup.x['t'], expected)
+
+ text = '<x t="sacr&eacute; bleu">'
+ soup = self.soup(text)
+ self.assertEquals(
+ soup.x['t'],
+ u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")
+
+ # This can cause valid HTML to become invalid.
+ valid_url = '<a href="http://example.org?a=1&amp;b=2;3">foo</a>'
+ soup = self.soup(valid_url)
+ self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
+
+ # I think it would be very difficult to 'fix' these tests, judging
+ # from my experience with previous versions of Beautiful Soup.
+ def test_naked_ampersands(self):
+ # Ampersands are treated as entities.
+ text = "<p>AT&T</p>"
+ soup = self.soup(text)
+ self.assertEquals(soup.p.string, "AT&T;")
+
+ def test_literal_in_textarea(self):
+ # Anything inside a <textarea> is supposed to be treated as
+ # the literal value of the field, (XXX citation
+ # needed). html5lib does this correctly. But, HTMLParser does its
+ # best to parse the contents of a <textarea> as HTML.
+ text = '<textarea>Junk like <b> tags and <&<&amp;</textarea>'
+ soup = self.soup(text)
+ self.assertEquals(len(soup.textarea.contents), 2)
+ self.assertEquals(soup.textarea.contents[0], u"Junk like ")
+ self.assertEquals(soup.textarea.contents[1].name, 'b')
+ self.assertEquals(soup.textarea.b.string, u" tags and <&<&")
+
+ def test_literal_in_script(self):
+ # The contents of a <script> tag are supposed to be treated as
+ # a literal string, even if that string contains HTML. But
+ # HTMLParser attempts to parse some of the HTML, causing much
+ # pain.
+ javascript = 'if (i < 2) { alert("<b>foo</b>"); }'
+ soup = self.soup('<script>%s</script>' % javascript)
+ self.assertEquals(soup.script.contents,
+ ['if (i < 2) { alert("<b>foo',
+ '"); }'])
+
+ # Namespaced doctypes cause an HTMLParseError
+ def test_namespaced_system_doctype(self):
+ self.assertRaises(HTMLParseError, self._test_doctype,
+ 'xsl:stylesheet SYSTEM "htmlent.dtd"')
+
+ def test_namespaced_public_doctype(self):
+ self.assertRaises(HTMLParseError, self._test_doctype,
+ 'xsl:stylesheet PUBLIC "htmlent.dtd"')
+
+
+class TestHTMLParserTreeBuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
+ # Oddly enough, HTMLParser seems to handle invalid markup exactly
+ # the same as lxml.
+ pass
+
+
+class TestHTMLParserTreeBuilderEncodingConversion(
+ TestLXMLBuilderEncodingConversion):
+ # Re-run the lxml tests for HTMLParser
+ pass
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 0eec688..7e83eff 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -168,27 +168,6 @@ class TestLXMLBuilder(SoupTest):
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
self.assertSoupEquals(text, expected)
- def test_entities_in_attribute_values_converted_during_parsing(self):
- text = '<x t="pi&#241ata">'
- expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
- soup = self.soup(text)
- self.assertEquals(soup.x['t'], expected)
-
- text = '<x t="pi&#xf1;ata">'
- soup = self.soup(text)
- self.assertEquals(soup.x['t'], expected)
-
- text = '<x t="sacr&eacute; bleu">'
- soup = self.soup(text)
- self.assertEquals(
- soup.x['t'],
- u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")
-
- # This can cause valid HTML to become invalid.
- valid_url = '<a href="http://example.org?a=1&amp;b=2;3">foo</a>'
- soup = self.soup(valid_url)
- self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
-
def test_smart_quotes_converted_on_the_way_in(self):
# Microsoft smart quotes are converted to Unicode characters during
# parsing.
@@ -230,7 +209,7 @@ class TestLXMLBuilder(SoupTest):
# Test a namespaced doctype with a system id.
self._test_doctype('xsl:stylesheet SYSTEM "htmlent.dtd"')
- def test_namespaced_system_doctype(self):
+ def test_namespaced_public_doctype(self):
# Test a namespaced doctype with a public id.
self._test_doctype('xsl:stylesheet PUBLIC "htmlent.dtd"')