diff options
| author | georg.brandl <devnull@localhost> | 2008-08-04 17:01:15 +0000 |
|---|---|---|
| committer | georg.brandl <devnull@localhost> | 2008-08-04 17:01:15 +0000 |
| commit | be303fd902f82be6ac2fcbeed1ceb0d8af2d49f7 (patch) | |
| tree | 3c9c0a3fe45cdac203e9504fdc00047629d0e1b1 /tests/etree13/HTMLTreeBuilder.py | |
| parent | f7799ac123341d00a9492baa66ee884fc3f4de53 (diff) | |
| download | sphinx-be303fd902f82be6ac2fcbeed1ceb0d8af2d49f7.tar.gz | |
Merged revisions 65283,65303,65316-65317,65372-65375,65377,65380,65483-65485,65494 via svnmerge from
svn+ssh://pythondev@svn.python.org/doctools/branches/0.4.x
........
r65283 | georg.brandl | 2008-07-29 10:07:26 +0000 (Tue, 29 Jul 2008) | 2 lines
Update ez_setup.py.
........
r65303 | benjamin.peterson | 2008-07-30 12:35:34 +0000 (Wed, 30 Jul 2008) | 1 line
add a with_testapp decorator for test functions that passes the TestApp instance in and cleans up after it
........
r65316 | benjamin.peterson | 2008-07-30 23:12:07 +0000 (Wed, 30 Jul 2008) | 1 line
make the app for test_markup global to the module
........
r65317 | benjamin.peterson | 2008-07-30 23:31:29 +0000 (Wed, 30 Jul 2008) | 1 line
make TestApp.cleanup more aggressive
........
r65372 | georg.brandl | 2008-08-01 19:11:22 +0000 (Fri, 01 Aug 2008) | 2 lines
Add more tests, fix a few bugs in image handling.
........
r65373 | georg.brandl | 2008-08-01 19:28:33 +0000 (Fri, 01 Aug 2008) | 2 lines
Fix oversight.
........
r65374 | benjamin.peterson | 2008-08-01 19:36:32 +0000 (Fri, 01 Aug 2008) | 1 line
fix one broken test
........
r65375 | georg.brandl | 2008-08-01 19:41:11 +0000 (Fri, 01 Aug 2008) | 2 lines
Fix the handling of non-ASCII input in quickstart.
........
r65377 | georg.brandl | 2008-08-01 19:48:24 +0000 (Fri, 01 Aug 2008) | 2 lines
Allow REs in markup checks.
........
r65380 | georg.brandl | 2008-08-01 20:31:18 +0000 (Fri, 01 Aug 2008) | 2 lines
Don't rely on mtimes being different for changed files.
........
r65483 | georg.brandl | 2008-08-04 09:01:40 +0000 (Mon, 04 Aug 2008) | 4 lines
Add an "encoding" option to literalinclude.
Add tests for include directives.
........
r65484 | georg.brandl | 2008-08-04 09:11:17 +0000 (Mon, 04 Aug 2008) | 2 lines
Add changelog entry.
........
r65485 | georg.brandl | 2008-08-04 09:21:58 +0000 (Mon, 04 Aug 2008) | 2 lines
Fix markup.
........
r65494 | georg.brandl | 2008-08-04 16:34:59 +0000 (Mon, 04 Aug 2008) | 2 lines
Correctly use HTML file suffix in templates.
........
Diffstat (limited to 'tests/etree13/HTMLTreeBuilder.py')
| -rw-r--r-- | tests/etree13/HTMLTreeBuilder.py | 230 |
1 files changed, 230 insertions, 0 deletions
diff --git a/tests/etree13/HTMLTreeBuilder.py b/tests/etree13/HTMLTreeBuilder.py new file mode 100644 index 00000000..4c5a24f6 --- /dev/null +++ b/tests/etree13/HTMLTreeBuilder.py @@ -0,0 +1,230 @@ +# +# ElementTree +# $Id$ +# +# a simple tree builder, for HTML input +# +# history: +# 2002-04-06 fl created +# 2002-04-07 fl ignore IMG and HR end tags +# 2002-04-07 fl added support for 1.5.2 and later +# 2003-04-13 fl added HTMLTreeBuilder alias +# 2004-12-02 fl don't feed non-ASCII charrefs/entities as 8-bit strings +# 2004-12-05 fl don't feed non-ASCII CDATA as 8-bit strings +# +# Copyright (c) 1999-2004 by Fredrik Lundh. All rights reserved. +# +# fredrik@pythonware.com +# http://www.pythonware.com +# +# -------------------------------------------------------------------- +# The ElementTree toolkit is +# +# Copyright (c) 1999-2007 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its +# associated documentation, you agree that you have read, understood, +# and will comply with the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and +# its associated documentation for any purpose and without fee is +# hereby granted, provided that the above copyright notice appears in +# all copies, and that both that copyright notice and this permission +# notice appear in supporting documentation, and that the name of +# Secret Labs AB or the author not be used in advertising or publicity +# pertaining to distribution of the software without specific, written +# prior permission. +# +# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD +# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- +# ABILITY AND FITNESS. 
# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

##
# Tools to build element trees from HTML files.
##

import htmlentitydefs
import re, string, sys
import mimetools, StringIO

import ElementTree

# Tags that may be left unclosed in the source.  The start-tag handler
# closes one when the same tag is opened again directly inside it, and
# the end-tag handler closes one that is still open when a different
# end tag arrives.
AUTOCLOSE = "p", "li", "tr", "th", "td", "head", "body"

# Tags that never have content: the start-tag handler ends the element
# immediately, and the end-tag handler ignores any explicit end tag.
IGNOREEND = "img", "hr", "meta", "link", "br"

# is_not_ascii(s) returns a match object if s contains a character
# outside the 7-bit ASCII range, and None otherwise.
if sys.version[:3] == "1.5":
    is_not_ascii = re.compile(r"[\x80-\xff]").search # 1.5.2
else:
    # NOTE(review): the pattern is presumably built with eval so that the
    # u"..." literal is never compiled by a 1.5 interpreter -- confirm.
    is_not_ascii = re.compile(eval(r'u"[\u0080-\uffff]"')).search

try:
    from HTMLParser import HTMLParser
except ImportError:
    from sgmllib import SGMLParser
    # hack to use sgmllib's SGMLParser to emulate 2.2's HTMLParser
    class HTMLParser(SGMLParser):
        # the following only works as long as this class doesn't
        # provide any do, start, or end handlers
        def unknown_starttag(self, tag, attrs):
            # forward to the HTMLParser-style hook
            self.handle_starttag(tag, attrs)
        def unknown_endtag(self, tag):
            # forward to the HTMLParser-style hook
            self.handle_endtag(tag)

##
# ElementTree builder for HTML source code.  This builder converts an
# HTML document or fragment to an ElementTree.
# <p>
# The parser is relatively picky, and requires balanced tags for most
# elements.  However, elements belonging to the following group are
# automatically closed: P, LI, TR, TH, TD, HEAD, and BODY.  In addition,
# the parser automatically inserts end tags immediately after the start
# tag, and ignores any end tags for the following group: IMG, HR,
# META, LINK, and BR.
#
# @keyparam builder Optional builder object.  If omitted, the parser
#     uses the standard <b>elementtree</b> builder.
# @keyparam encoding Optional character encoding, if known.
#     If omitted, the parser looks for META tags inside the document.
#     If no tags are found, the parser defaults to ISO-8859-1.  Note
#     that if your document uses a non-ASCII compatible encoding, you
#     must decode the document before parsing.
#
# @see elementtree.ElementTree

class HTMLTreeBuilder(HTMLParser):

    # FIXME: shouldn't this class be named Parser, not Builder?

    ##
    # Creates the parser.
    #
    # @keyparam builder Optional builder object; it must provide
    #     start(tag, attrib), end(tag), data(text) and close().  If
    #     omitted, an ElementTree.TreeBuilder instance is created.
    # @keyparam encoding Optional initial character encoding; defaults
    #     to "iso-8859-1".

    def __init__(self, builder=None, encoding=None):
        # tags of the elements that are currently open, innermost last
        self.__stack = []
        if builder is None:
            builder = ElementTree.TreeBuilder()
        self.__builder = builder
        # used by handle_data to decode non-ASCII 8-bit strings
        self.encoding = encoding or "iso-8859-1"
        HTMLParser.__init__(self)

    ##
    # Flushes parser buffers, and return the root element.
    #
    # @return An Element instance.

    def close(self):
        HTMLParser.close(self)
        return self.__builder.close()

    ##
    # (Internal) Handles start tags.
    #
    # @param tag The tag name.
    # @param attrs A sequence of (name, value) pairs.

    def handle_starttag(self, tag, attrs):
        if tag == "meta":
            # look for encoding directives
            http_equiv = content = None
            for k, v in attrs:
                if k == "http-equiv":
                    # an attribute given without a value may be reported
                    # as None; treat that as "no directive"
                    http_equiv = string.lower(v or "")
                elif k == "content":
                    content = v
            if http_equiv == "content-type" and content:
                # use mimetools to parse the http header
                header = mimetools.Message(
                    StringIO.StringIO("%s: %s\n\n" % (http_equiv, content))
                    )
                encoding = header.getparam("charset")
                if encoding:
                    # NOTE: this replaces whatever encoding is currently
                    # set, including one passed to the constructor
                    self.encoding = encoding
        if tag in AUTOCLOSE:
            # <p>...<p> and friends: close the open sibling first
            if self.__stack and self.__stack[-1] == tag:
                self.handle_endtag(tag)
        self.__stack.append(tag)
        attrib = {}
        if attrs:
            for k, v in attrs:
                attrib[string.lower(k)] = v
        self.__builder.start(tag, attrib)
        if tag in IGNOREEND:
            # content-less element: end it right away
            self.__stack.pop()
            self.__builder.end(tag)

    ##
    # (Internal) Handles end tags.
    #
    # Elements in the AUTOCLOSE group that are still open inside the
    # element being closed are ended first, innermost first, so that
    # the builder always sees properly nested start/end calls.
    #
    # @param tag The tag name.

    def handle_endtag(self, tag):
        if tag in IGNOREEND:
            return
        lasttag = self.__stack.pop()
        # Unwind implicitly closed elements one at a time.  Each one is
        # reported to the builder before the next stack entry is popped;
        # popping first would skip an element and hand the builder a
        # mismatched end tag once more than one level is left open.
        while tag != lasttag and lasttag in AUTOCLOSE:
            self.__builder.end(lasttag)
            lasttag = self.__stack.pop()
        self.__builder.end(tag)

    ##
    # (Internal) Handles character references.
    #
    # @param char The reference body without "&#" and ";": decimal
    #     digits, or "x"/"X" followed by hexadecimal digits.

    def handle_charref(self, char):
        # the hex marker may be written in either case (&#x41; / &#X41;)
        if char[:1] in ("x", "X"):
            char = int(char[1:], 16)
        else:
            char = int(char)
        if 0 <= char < 128:
            self.__builder.data(chr(char))
        else:
            self.__builder.data(unichr(char))

    ##
    # (Internal) Handles entity references.
    #
    # @param name The entity name, without "&" and ";".

    def handle_entityref(self, name):
        entity = htmlentitydefs.entitydefs.get(name)
        if entity:
            if len(entity) == 1:
                # table entry is the character itself
                entity = ord(entity)
            else:
                # table entry is a "&#NNN;" character reference
                entity = int(entity[2:-1])
            if 0 <= entity < 128:
                self.__builder.data(chr(entity))
            else:
                self.__builder.data(unichr(entity))
        else:
            self.unknown_entityref(name)

    ##
    # (Internal) Handles character data.
    #
    # @param data The text to pass on to the builder.

    def handle_data(self, data):
        if isinstance(data, type('')) and is_not_ascii(data):
            # convert to unicode, but only if necessary
            data = unicode(data, self.encoding, "ignore")
        self.__builder.data(data)

    ##
    # (Hook) Handles unknown entity references.  The default action
    # is to ignore unknown entities.
    #
    # @param name The entity name, without "&" and ";".

    def unknown_entityref(self, name):
        pass # ignore by default; override if necessary

##
# An alias for the <b>HTMLTreeBuilder</b> class.

TreeBuilder = HTMLTreeBuilder

##
# Parse an HTML document or document fragment.
#
# @param source A filename or file object containing HTML data.
# @param encoding Optional character encoding, if known.  If omitted,
#     the parser looks for META tags inside the document.  If no tags
#     are found, the parser defaults to ISO-8859-1.
# @return An ElementTree instance

def parse(source, encoding=None):
    return ElementTree.parse(source, HTMLTreeBuilder(encoding=encoding))

if __name__ == "__main__":
    import sys
    # dump the tree built from the HTML file named on the command line
    ElementTree.dump(parse(open(sys.argv[1])))
