diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-27 18:08:59 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-27 18:08:59 -0500 |
commit | 3156a689a566966079bba7fb19497314e9184b94 (patch) | |
tree | 463a004e531b553f9cfb1ab58507392b99fe5efa | |
parent | 8c7a895ff8cfc357543966137a7f71e48a9ea02d (diff) | |
download | beautifulsoup4-3156a689a566966079bba7fb19497314e9184b94.tar.gz |
Added a tree builder for the built-in HTMLParser, and tests.
-rw-r--r-- | bs4/builder/__init__.py | 6 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 94 | ||||
-rw-r--r-- | bs4/dammit.py | 8 | ||||
-rw-r--r-- | tests/test_builder_registry.py | 8 | ||||
-rw-r--r-- | tests/test_htmlparser.py | 126 | ||||
-rw-r--r-- | tests/test_lxml.py | 23 |
6 files changed, 238 insertions, 27 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 10c6b7f..17dcff3 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -12,6 +12,7 @@ __all__ = [ # Some useful features for a TreeBuilder to have. FAST = 'fast' PERMISSIVE = 'permissive' +STRICT = 'strict' XML = 'xml' HTML = 'html' HTML_5 = 'html5' @@ -244,7 +245,10 @@ def register_treebuilders_from(module): # Builders are registered in reverse order of priority, so that custom # builder registrations will take precedence. In general, we want -# html5lib to take precedence over lxml, because it's more reliable. +# html5lib to take precedence over lxml, because it's more +# reliable. And we only want to use HTMLParser as a last resort. +import _htmlparser +register_treebuilders_from(_htmlparser) try: import _lxml register_treebuilders_from(_lxml) diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py new file mode 100644 index 0000000..c293d9e --- /dev/null +++ b/bs4/builder/_htmlparser.py @@ -0,0 +1,94 @@ +"""Use the HTMLParser library to parse HTML files that aren't too bad.""" + +__all__ = [ + 'HTMLParserTreeBuilder', + ] + +from HTMLParser import HTMLParser +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + ProcessingInstruction, + ) +from bs4.dammit import EntitySubstitution, UnicodeDammit + +from bs4.builder import ( + HTML, + HTMLTreeBuilder, + STRICT, + ) + + +HTMLPARSER = 'html.parser' + +class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): + + is_xml = False + features = [HTML, STRICT, HTMLPARSER] + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + """ + :return: A 3-tuple (markup, original encoding, encoding + declared within markup). 
+ """ + if isinstance(markup, unicode): + return markup, None, None + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, isHTML=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding) + + def feed(self, markup): + super(HTMLParserTreeBuilder, self).feed(markup) + + def handle_starttag(self, name, attrs): + self.soup.handle_starttag(name, dict(attrs)) + + def handle_endtag(self, name): + self.soup.handle_endtag(name) + + def handle_data(self, data): + self.soup.handle_data(data) + + def handle_charref(self, name): + self.handle_data(unichr(int(name))) + + def handle_entityref(self, name): + character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) + if character is not None: + data = character + else: + data = "&%s;" % name + self.handle_data(data) + + def handle_comment(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(Comment) + + def handle_decl(self, data): + self.soup.endData() + if data.startswith("DOCTYPE "): + data = data[len("DOCTYPE "):] + self.soup.handle_data(data) + self.soup.endData(Doctype) + + def unknown_decl(self, data): + if data.upper().startswith('CDATA['): + cls = CData + data = data[len('CDATA['):] + else: + cls = Declaration + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(cls) + + def handle_pi(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + diff --git a/bs4/dammit.py b/bs4/dammit.py index 4483118..75d445e 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -35,6 +35,7 @@ class EntitySubstitution(object): def _populate_class_variables(): lookup = {} + reverse_lookup = {} characters = [] for codepoint, name in codepoint2name.items(): if codepoint == 34: @@ -45,10 +46,11 @@ class EntitySubstitution(object): character = unichr(codepoint) characters.append(character) lookup[character] = name + reverse_lookup[name] = 
character re_definition = "[%s]" % "".join(characters) - return lookup, re.compile(re_definition) - CHARACTER_TO_HTML_ENTITY, CHARACTER_TO_HTML_ENTITY_RE = ( - _populate_class_variables()) + return lookup, reverse_lookup, re.compile(re_definition) + (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, + CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() CHARACTER_TO_XML_ENTITY = { diff --git a/tests/test_builder_registry.py b/tests/test_builder_registry.py index ee5b2da..655cd06 100644 --- a/tests/test_builder_registry.py +++ b/tests/test_builder_registry.py @@ -8,7 +8,8 @@ from bs4.builder import ( LXMLTreeBuilderForXML, LXMLTreeBuilder, TreeBuilderRegistry, - HTML5TreeBuilder + HTML5TreeBuilder, + HTMLParserTreeBuilder, ) @@ -21,6 +22,8 @@ class BuiltInRegistryTest(unittest.TestCase): LXMLTreeBuilder) self.assertEquals(registry.lookup('permissive', 'xml'), LXMLTreeBuilderForXML) + self.assertEquals(registry.lookup('strict', 'html'), + HTMLParserTreeBuilder) self.assertEquals(registry.lookup('permissive', 'html'), HTML5TreeBuilder) @@ -36,6 +39,9 @@ class BuiltInRegistryTest(unittest.TestCase): self.assertEquals(registry.lookup('html5lib'), HTML5TreeBuilder) + self.assertEquals(registry.lookup('html.parser'), + HTMLParserTreeBuilder) + def test_unimplemented_combinations(self): self.assertEquals(registry.lookup('fast', 'permissive', 'html'), None) diff --git a/tests/test_htmlparser.py b/tests/test_htmlparser.py new file mode 100644 index 0000000..c8a446e --- /dev/null +++ b/tests/test_htmlparser.py @@ -0,0 +1,126 @@ +from HTMLParser import HTMLParseError +from bs4.builder import HTMLParserTreeBuilder +from bs4.element import CData +from test_lxml import ( + TestLXMLBuilder, + TestLXMLBuilderEncodingConversion, + TestLXMLBuilderInvalidMarkup, + ) + +class TestHTMLParserTreeBuilder(TestLXMLBuilder): + """See `BuilderSmokeTest`.""" + + @property + def default_builder(self): + return HTMLParserTreeBuilder() + + def test_bare_string(self): + # A bare string is 
turned into some kind of HTML document or + # fragment recognizable as the original string. + # + # HTMLParser does not modify the bare string at all. + self.assertSoupEquals("A bare string") + + def test_cdata_where_its_ok(self): + # HTMLParser recognizes CDATA sections and passes them through. + markup = "<svg><![CDATA[foobar]]></svg>" + self.assertSoupEquals(markup) + soup = self.soup(markup) + string = soup.svg.string + self.assertEquals(string, "foobar") + self.assertTrue(isinstance(string, CData)) + + # These are tests that could be 'fixed' by improving the + # HTMLParserTreeBuilder, but I don't think it's worth it. Users + # will have fewer headaches if they use one of the other tree + # builders. + + def test_empty_element(self): + # HTML's empty-element tags are not recognized as such + # unless they are presented as empty-element tags. + self.assertSoupEquals( + "<p>A <meta> tag</p>", "<p>A <meta> tag</meta></p>") + + self.assertSoupEquals( + "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>") + + def test_entities_in_attribute_values_converted_during_parsing(self): + + # The numeric entity isn't recognized without the closing + # semicolon. + text = '<x t="piñata">' + expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" + soup = self.soup(text) + self.assertEquals(soup.x['t'], "piñata") + + text = '<x t="piñata">' + expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" + soup = self.soup(text) + self.assertEquals(soup.x['t'], u"pi\xf1ata") + + text = '<x t="piñata">' + soup = self.soup(text) + self.assertEquals(soup.x['t'], expected) + + text = '<x t="sacré bleu">' + soup = self.soup(text) + self.assertEquals( + soup.x['t'], + u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu") + + # This can cause valid HTML to become invalid. 
+ valid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>' + soup = self.soup(valid_url) + self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3") + + # I think it would be very difficult to 'fix' these tests, judging + # from my experience with previous versions of Beautiful Soup. + def test_naked_ampersands(self): + # Ampersands are treated as entities. + text = "<p>AT&T</p>" + soup = self.soup(text) + self.assertEquals(soup.p.string, "AT&T;") + + def test_literal_in_textarea(self): + # Anything inside a <textarea> is supposed to be treated as + # the literal value of the field, (XXX citation + # needed). html5lib does this correctly. But, HTMLParser does its + # best to parse the contents of a <textarea> as HTML. + text = '<textarea>Junk like <b> tags and <&<&</textarea>' + soup = self.soup(text) + self.assertEquals(len(soup.textarea.contents), 2) + self.assertEquals(soup.textarea.contents[0], u"Junk like ") + self.assertEquals(soup.textarea.contents[1].name, 'b') + self.assertEquals(soup.textarea.b.string, u" tags and <&<&") + + def test_literal_in_script(self): + # The contents of a <script> tag are supposed to be treated as + # a literal string, even if that string contains HTML. But + # HTMLParser attempts to parse some of the HTML, causing much + # pain. + javascript = 'if (i < 2) { alert("<b>foo</b>"); }' + soup = self.soup('<script>%s</script>' % javascript) + self.assertEquals(soup.script.contents, + ['if (i < 2) { alert("<b>foo', + '"); }']) + + # Namespaced doctypes cause an HTMLParseError + def test_namespaced_system_doctype(self): + self.assertRaises(HTMLParseError, self._test_doctype, + 'xsl:stylesheet SYSTEM "htmlent.dtd"') + + def test_namespaced_public_doctype(self): + self.assertRaises(HTMLParseError, self._test_doctype, + 'xsl:stylesheet PUBLIC "htmlent.dtd"') + + +class TestHTMLParserTreeBuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): + # Oddly enough, HTMLParser seems to handle invalid markup exactly + # the same as lxml. 
+ pass + + +class TestHTMLParserTreeBuilderEncodingConversion( + TestLXMLBuilderEncodingConversion): + # Re-run the lxml tests for HTMLParser + pass diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 0eec688..7e83eff 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -168,27 +168,6 @@ class TestLXMLBuilder(SoupTest): expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>" self.assertSoupEquals(text, expected) - def test_entities_in_attribute_values_converted_during_parsing(self): - text = '<x t="piñata">' - expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata" - soup = self.soup(text) - self.assertEquals(soup.x['t'], expected) - - text = '<x t="piñata">' - soup = self.soup(text) - self.assertEquals(soup.x['t'], expected) - - text = '<x t="sacré bleu">' - soup = self.soup(text) - self.assertEquals( - soup.x['t'], - u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu") - - # This can cause valid HTML to become invalid. - valid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>' - soup = self.soup(valid_url) - self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3") - def test_smart_quotes_converted_on_the_way_in(self): # Microsoft smart quotes are converted to Unicode characters during # parsing. @@ -230,7 +209,7 @@ class TestLXMLBuilder(SoupTest): # Test a namespaced doctype with a system id. self._test_doctype('xsl:stylesheet SYSTEM "htmlent.dtd"') - def test_namespaced_system_doctype(self): + def test_namespaced_public_doctype(self): # Test a namespaced doctype with a public id. self._test_doctype('xsl:stylesheet PUBLIC "htmlent.dtd"') |