summary | refs | log | tree | commit | diff
path: root/bs4
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2011-02-27 18:08:59 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2011-02-27 18:08:59 -0500
commit 3156a689a566966079bba7fb19497314e9184b94 (patch)
tree 463a004e531b553f9cfb1ab58507392b99fe5efa /bs4
parent 8c7a895ff8cfc357543966137a7f71e48a9ea02d (diff)
download beautifulsoup4-3156a689a566966079bba7fb19497314e9184b94.tar.gz
Added a tree builder for the built-in HTMLParser, and tests.
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/builder/__init__.py 6
-rw-r--r-- bs4/builder/_htmlparser.py 94
-rw-r--r-- bs4/dammit.py 8
3 files changed, 104 insertions, 4 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 10c6b7f..17dcff3 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -12,6 +12,7 @@ __all__ = [
# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
+STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
@@ -244,7 +245,10 @@ def register_treebuilders_from(module):
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want
-# html5lib to take precedence over lxml, because it's more reliable.
+# html5lib to take precedence over lxml, because it's more
+# reliable. And we only want to use HTMLParser as a last resort.
+import _htmlparser
+register_treebuilders_from(_htmlparser)
try:
import _lxml
register_treebuilders_from(_lxml)
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
new file mode 100644
index 0000000..c293d9e
--- /dev/null
+++ b/bs4/builder/_htmlparser.py
@@ -0,0 +1,94 @@
+"""Use the HTMLParser library to parse HTML files that aren't too bad."""
+
+__all__ = [
+ 'HTMLParserTreeBuilder',
+ ]
+
+from HTMLParser import HTMLParser
+from bs4.element import (
+ CData,
+ Comment,
+ Declaration,
+ Doctype,
+ ProcessingInstruction,
+ )
+from bs4.dammit import EntitySubstitution, UnicodeDammit
+
+from bs4.builder import (
+ HTML,
+ HTMLTreeBuilder,
+ STRICT,
+ )
+
+
+HTMLPARSER = 'html.parser'
+
+class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
+
+ is_xml = False
+ features = [HTML, STRICT, HTMLPARSER]
+
+ def prepare_markup(self, markup, user_specified_encoding=None,
+ document_declared_encoding=None):
+ """
+ :return: A 3-tuple (markup, original encoding, encoding
+ declared within markup).
+ """
+ if isinstance(markup, unicode):
+ return markup, None, None
+
+ try_encodings = [user_specified_encoding, document_declared_encoding]
+ dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
+ return (dammit.markup, dammit.original_encoding,
+ dammit.declared_html_encoding)
+
+ def feed(self, markup):
+ super(HTMLParserTreeBuilder, self).feed(markup)
+
+ def handle_starttag(self, name, attrs):
+ self.soup.handle_starttag(name, dict(attrs))
+
+ def handle_endtag(self, name):
+ self.soup.handle_endtag(name)
+
+ def handle_data(self, data):
+ self.soup.handle_data(data)
+
+ def handle_charref(self, name):
+ self.handle_data(unichr(int(name)))
+
+ def handle_entityref(self, name):
+ character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
+ if character is not None:
+ data = character
+ else:
+ data = "&%s;" % name
+ self.handle_data(data)
+
+ def handle_comment(self, data):
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(Comment)
+
+ def handle_decl(self, data):
+ self.soup.endData()
+ if data.startswith("DOCTYPE "):
+ data = data[len("DOCTYPE "):]
+ self.soup.handle_data(data)
+ self.soup.endData(Doctype)
+
+ def unknown_decl(self, data):
+ if data.upper().startswith('CDATA['):
+ cls = CData
+ data = data[len('CDATA['):]
+ else:
+ cls = Declaration
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(cls)
+
+ def handle_pi(self, data):
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(ProcessingInstruction)
+
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 4483118..75d445e 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -35,6 +35,7 @@ class EntitySubstitution(object):
def _populate_class_variables():
lookup = {}
+ reverse_lookup = {}
characters = []
for codepoint, name in codepoint2name.items():
if codepoint == 34:
@@ -45,10 +46,11 @@ class EntitySubstitution(object):
character = unichr(codepoint)
characters.append(character)
lookup[character] = name
+ reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters)
- return lookup, re.compile(re_definition)
- CHARACTER_TO_HTML_ENTITY, CHARACTER_TO_HTML_ENTITY_RE = (
- _populate_class_variables())
+ return lookup, reverse_lookup, re.compile(re_definition)
+ (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
+ CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
CHARACTER_TO_XML_ENTITY = {