diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-03-01 13:37:42 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-03-01 13:37:42 -0500 |
commit | 76e93195ae2c62cba701bf9452caaea8a48f7833 (patch) | |
tree | bc480fac48cc2e24c14b1a803c77d01482a5f705 | |
parent | 870c2c422aa77bc3cb3a49b39fa8f8b633ec18ad (diff) | |
download | beautifulsoup4-76e93195ae2c62cba701bf9452caaea8a48f7833.tar.gz |
In HTML5-style <meta charset="foo"> tags, the value of the "charset" attribute is now replaced with the appropriate encoding on output. [bug=942714]
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 59 | ||||
-rw-r--r-- | bs4/testing.py | 18 |
3 files changed, 51 insertions, 29 deletions
```diff
@@ -2,6 +2,9 @@
 
 * Added support for simple CSS selectors, taken from the soupselect project.
 
+* In HTML5-style <meta charset="foo"> tags, the value of the "charset"
+  attribute is now replaced with the appropriate encoding on output.
+
 = 4.0.0b9 (20110228) =
 
 * Fixed the string representation of DOCTYPEs that have both a public
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 4e31572..a38a98f 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -135,7 +135,7 @@ class TreeBuilder(object):
         return fragment
 
     def set_up_substitutions(self, tag):
-        pass
+        return False
 
 
 class SAXTreeBuilder(TreeBuilder):
@@ -222,41 +222,42 @@ class HTMLTreeBuilder(TreeBuilder):
     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
 
     def set_up_substitutions(self, tag):
+        # We are only interested in <meta> tags
         if tag.name != 'meta':
             return False
 
         http_equiv = tag.get('http-equiv')
         content = tag.get('content')
-
-        if (http_equiv is not None
-            and content is not None
-            and http_equiv.lower() == 'content-type'):
-            # This is an interesting meta tag.
+        charset = tag.get('charset')
+
+        # We are interested in <meta> tags that say what encoding the
+        # document was originally in. This means HTML 5-style <meta>
+        # tags that provide the "charset" attribute. It also means
+        # HTML 4-style <meta> tags that provide the "content"
+        # attribute and have "http-equiv" set to "content-type".
+        meta_encoding = None
+        if charset is not None:
+            # HTML 5 style:
+            # <meta charset="utf8">
+            meta_encoding = charset
+
+            # Modify the tag.
+            tag['charset'] = "%SOUP-ENCODING%"
+
+        elif (content is not None and http_equiv is not None
+              and http_equiv.lower() == 'content-type'):
+            # HTML 4 style:
+            # <meta http-equiv="content-type" content="text/html; charset=utf8">
             match = self.CHARSET_RE.search(content)
-            if match:
-                if (self.soup.declared_html_encoding is not None or
-                    self.soup.original_encoding == self.soup.from_encoding):
-                    # An HTML encoding was sniffed while converting
-                    # the document to Unicode, or an HTML encoding was
-                    # sniffed during a previous pass through the
-                    # document, or an encoding was specified
-                    # explicitly and it worked. Rewrite the meta tag.
-                    def rewrite(match):
-                        return match.group(1) + "%SOUP-ENCODING%"
-                    tag['content'] = self.CHARSET_RE.sub(rewrite, content)
-                    return True
-                else:
-                    # This is our first pass through the document.
-                    # Go through it again with the encoding information.
-                    new_charset = match.group(3)
-                    if (new_charset is not None
-                        and new_charset != self.soup.original_encoding):
-                        self.soup.declared_html_encoding = new_charset
-                        self.soup._feed(self.soup.declared_html_encoding)
-                        raise StopParsing
-                    pass
-        return False
+            if match is not None:
+                meta_encoding = match.group(3)
+
+                # Modify the tag.
+                def rewrite(match):
+                    return match.group(1) + "%SOUP-ENCODING%"
+                tag['content'] = self.CHARSET_RE.sub(rewrite, content)
+        return (meta_encoding is not None)
 
 
 def register_treebuilders_from(module):
     """Copy TreeBuilders from the given module into this module."""
diff --git a/bs4/testing.py b/bs4/testing.py
index a3e0b38..1b73160 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -368,6 +368,24 @@ class HTMLTreeBuilderSmokeTest(object):
         # For the rest of the story, see TestSubstitutions in
         # test_tree.py.
 
+    def test_html5_style_meta_tag_reflects_current_encoding(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta id="encoding" charset="x-sjis" />')
+
+        # Here's a document incorporating that meta tag.
+        shift_jis_html = (
+            '<html><head>\n%s\n'
+            '<meta http-equiv="Content-language" content="ja"/>'
+            '</head><body>Shift-JIS markup goes here.') % meta_tag
+        soup = self.soup(shift_jis_html)
+
+        # Parse the document, and the charset is replaced with a
+        # generic value.
+        parsed_meta = soup.find('meta', id="encoding")
+        self.assertEqual('%SOUP-ENCODING%', parsed_meta['charset'])
+        self.assertEqual(True, parsed_meta.contains_substitutions)
+
 class XMLTreeBuilderSmokeTest(object):
 
     def test_docstring_generated(self):
```