diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-22 13:19:06 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-22 13:19:06 -0500 |
commit | 1d59c72089bd9c4c00da44e0268e3c6af05613f3 (patch) | |
tree | 50b81b9d2893aa02d3d6fc53b40b94fb71d0eafb | |
parent | 88adcd5e18e1607ebbefe9677ceda400e6b76037 (diff) | |
download | beautifulsoup4-1d59c72089bd9c4c00da44e0268e3c6af05613f3.tar.gz |
Added scripts.
-rw-r--r-- | scripts/demo_differences.py | 55 | ||||
-rw-r--r-- | scripts/differences.txt | 34 |
2 files changed, 89 insertions, 0 deletions
diff --git a/scripts/demo_differences.py b/scripts/demo_differences.py
new file mode 100644
index 0000000..c544ea1
--- /dev/null
+++ b/scripts/demo_differences.py
@@ -0,0 +1,55 @@
+from bs4 import BeautifulSoup
+
+different_results = []
+uniform_results = []
+
+class Demonstration(object):
+    def __init__(self, markup):
+        self.results = {}
+        self.markup = markup
+
+    def run_against(self, *parser_names):
+        uniform_results = True
+        previous_output = None
+        for parser in parser_names:
+            try:
+                soup = BeautifulSoup(self.markup, parser)
+                if markup.startswith("<div>"):
+                    # Extract the interesting part
+                    output = soup.div
+                else:
+                    output = soup
+            except Exception, e:
+                output = "[EXCEPTION] %s" % str(e)
+            self.results[parser] = output
+            if previous_output is None:
+                previous_output = output
+            elif previous_output != output:
+                uniform_results = False
+        return uniform_results
+
+    def dump(self):
+        print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8"))
+        for parser, output in self.results.items():
+            print "%s: %s" % (parser.rjust(13), output.encode("utf8"))
+
+
+for markup in open("differences.txt"):
+    demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n"))
+    is_uniform = demo.run_against("html.parser", "lxml", "html5lib")
+    if is_uniform:
+        uniform_results.append(demo)
+    else:
+        different_results.append(demo)
+
+print "Markup that's handled the same in every parser:"
+for demo in uniform_results:
+    demo.dump()
+    print "-" * 80
+print
+print "=" * 80
+print
+print "Markup that's not handled the same in every parser:"
+for demo in different_results:
+    demo.dump()
+    print "-" * 80
diff --git a/scripts/differences.txt b/scripts/differences.txt
new file mode 100644
index 0000000..a7914a0
--- /dev/null
+++ b/scripts/differences.txt
@@ -0,0 +1,34 @@
+A bare string
+<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
+<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
+<div><![CDATA[A CDATA section where it doesn't belong]]></div>
+<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
+<div>A <meta> tag</div>
+<div>A <br> tag that supposedly has contents.</br></div>
+<div>AT&T</div>
+<div><textarea>Within a textarea, markup like <b> tags and <&<& should be treated as literal</textarea></div>
+<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
+<div>This numeric entity is missing the final semicolon: <x t="piñata"></div>
+<div><a href="http://example.com/</a> that attribute value never got closed</div>
+<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
+<! This document starts with a bogus declaration ><div>a</div>
+<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
+<div>This document ends with <!an incomplete declaration
+<div><a style={height:21px;}>That attribute value was bogus</a></div>
+<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
+<div><table><td nowrap>That boolean attribute had no value</td></table></div>
+<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
+<div>This document ends before the entity finishes: >
+<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
+<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
+<div><table><tr><td>Here's a table</td></tr></table></div>
+<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
+<div>This tag contains nothing but whitespace: <b> </b></div>
+<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
+<div><table><div>This table contains bare markup</div></table></div>
+<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
+<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
+<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
+<div><our☃>Tag name contains Unicode characters</our☃></div>
+<div><a ☃="snowman">Attribute name contains Unicode characters</a></div>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |