diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-27 18:08:59 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-27 18:08:59 -0500 |
commit | 3156a689a566966079bba7fb19497314e9184b94 (patch) | |
tree | 463a004e531b553f9cfb1ab58507392b99fe5efa /bs4 | |
parent | 8c7a895ff8cfc357543966137a7f71e48a9ea02d (diff) | |
download | beautifulsoup4-3156a689a566966079bba7fb19497314e9184b94.tar.gz |
Added a tree builder for the built-in HTMLParser, and tests.
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/builder/__init__.py | 6 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 94 | ||||
-rw-r--r-- | bs4/dammit.py | 8 |
3 files changed, 104 insertions, 4 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 10c6b7f..17dcff3 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -12,6 +12,7 @@ __all__ = [ # Some useful features for a TreeBuilder to have. FAST = 'fast' PERMISSIVE = 'permissive' +STRICT = 'strict' XML = 'xml' HTML = 'html' HTML_5 = 'html5' @@ -244,7 +245,10 @@ def register_treebuilders_from(module): # Builders are registered in reverse order of priority, so that custom # builder registrations will take precedence. In general, we want -# html5lib to take precedence over lxml, because it's more reliable. +# html5lib to take precedence over lxml, because it's more +# reliable. And we only want to use HTMLParser as a last resort. +import _htmlparser +register_treebuilders_from(_htmlparser) try: import _lxml register_treebuilders_from(_lxml) diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py new file mode 100644 index 0000000..c293d9e --- /dev/null +++ b/bs4/builder/_htmlparser.py @@ -0,0 +1,94 @@ +"""Use the HTMLParser library to parse HTML files that aren't too bad.""" + +__all__ = [ + 'HTMLParserTreeBuilder', + ] + +from HTMLParser import HTMLParser +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + ProcessingInstruction, + ) +from bs4.dammit import EntitySubstitution, UnicodeDammit + +from bs4.builder import ( + HTML, + HTMLTreeBuilder, + STRICT, + ) + + +HTMLPARSER = 'html.parser' + +class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): + + is_xml = False + features = [HTML, STRICT, HTMLPARSER] + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + """ + :return: A 3-tuple (markup, original encoding, encoding + declared within markup). 
+ """ + if isinstance(markup, unicode): + return markup, None, None + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, isHTML=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding) + + def feed(self, markup): + super(HTMLParserTreeBuilder, self).feed(markup) + + def handle_starttag(self, name, attrs): + self.soup.handle_starttag(name, dict(attrs)) + + def handle_endtag(self, name): + self.soup.handle_endtag(name) + + def handle_data(self, data): + self.soup.handle_data(data) + + def handle_charref(self, name): + self.handle_data(unichr(int(name))) + + def handle_entityref(self, name): + character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) + if character is not None: + data = character + else: + data = "&%s;" % name + self.handle_data(data) + + def handle_comment(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(Comment) + + def handle_decl(self, data): + self.soup.endData() + if data.startswith("DOCTYPE "): + data = data[len("DOCTYPE "):] + self.soup.handle_data(data) + self.soup.endData(Doctype) + + def unknown_decl(self, data): + if data.upper().startswith('CDATA['): + cls = CData + data = data[len('CDATA['):] + else: + cls = Declaration + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(cls) + + def handle_pi(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + diff --git a/bs4/dammit.py b/bs4/dammit.py index 4483118..75d445e 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -35,6 +35,7 @@ class EntitySubstitution(object): def _populate_class_variables(): lookup = {} + reverse_lookup = {} characters = [] for codepoint, name in codepoint2name.items(): if codepoint == 34: @@ -45,10 +46,11 @@ class EntitySubstitution(object): character = unichr(codepoint) characters.append(character) lookup[character] = name + reverse_lookup[name] = 
character re_definition = "[%s]" % "".join(characters) - return lookup, re.compile(re_definition) - CHARACTER_TO_HTML_ENTITY, CHARACTER_TO_HTML_ENTITY_RE = ( - _populate_class_variables()) + return lookup, reverse_lookup, re.compile(re_definition) + (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, + CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() CHARACTER_TO_XML_ENTITY = { |