In HTML5-style <meta charset="foo"> tags, the value of the "charset" attribute is now replaced with the appropriate encoding on output. [bug=942714]

author: Leonard Richardson <leonard.richardson@canonical.com> 2012-03-01 13:37:42 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-03-01 13:37:42 -0500
commit: 76e93195ae2c62cba701bf9452caaea8a48f7833 (patch)
tree: bc480fac48cc2e24c14b1a803c77d01482a5f705
parent: 870c2c422aa77bc3cb3a49b39fa8f8b633ec18ad (diff)
download: beautifulsoup4-76e93195ae2c62cba701bf9452caaea8a48f7833.tar.gz
3 files changed, 51 insertions, 29 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 944c677..a1e650b 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -2,6 +2,9 @@
 
 * Added support for simple CSS selectors, taken from the soupselect project.
 
+* In HTML5-style <meta charset="foo"> tags, the value of the "charset"
+  attribute is now replaced with the appropriate encoding on output.
+
 = 4.0.0b9 (20110228) =
 
 * Fixed the string representation of DOCTYPEs that have both a public
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 4e31572..a38a98f 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -135,7 +135,7 @@ class TreeBuilder(object):
         return fragment
 
     def set_up_substitutions(self, tag):
-        pass
+        return False
 
 
 class SAXTreeBuilder(TreeBuilder):
@@ -222,41 +222,42 @@ class HTMLTreeBuilder(TreeBuilder):
     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
 
     def set_up_substitutions(self, tag):
+        # We are only interested in <meta> tags
         if tag.name != 'meta':
             return False
 
         http_equiv = tag.get('http-equiv')
         content = tag.get('content')
-
-        if (http_equiv is not None
-            and content is not None
-            and http_equiv.lower() == 'content-type'):
-            # This is an interesting meta tag.
+        charset = tag.get('charset')
+
+        # We are interested in <meta> tags that say what encoding the
+        # document was originally in. This means HTML 5-style <meta>
+        # tags that provide the "charset" attribute. It also means
+        # HTML 4-style <meta> tags that provide the "content"
+        # attribute and have "http-equiv" set to "content-type".
+        meta_encoding = None
+        if charset is not None:
+            # HTML 5 style:
+            # <meta charset="utf8">
+            meta_encoding = charset
+
+            # Modify the tag.
+            tag['charset'] = "%SOUP-ENCODING%"
+
+        elif (content is not None and http_equiv is not None
+              and http_equiv.lower() == 'content-type'):
+            # HTML 4 style:
+            # <meta http-equiv="content-type" content="text/html; charset=utf8">
             match = self.CHARSET_RE.search(content)
-            if match:
-                if (self.soup.declared_html_encoding is not None or
-                    self.soup.original_encoding == self.soup.from_encoding):
-                    # An HTML encoding was sniffed while converting
-                    # the document to Unicode, or an HTML encoding was
-                    # sniffed during a previous pass through the
-                    # document, or an encoding was specified
-                    # explicitly and it worked. Rewrite the meta tag.
-                    def rewrite(match):
-                        return match.group(1) + "%SOUP-ENCODING%"
-                    tag['content'] = self.CHARSET_RE.sub(rewrite, content)
-                    return True
-                else:
-                    # This is our first pass through the document.
-                    # Go through it again with the encoding information.
-                    new_charset = match.group(3)
-                    if (new_charset is not None
-                        and new_charset != self.soup.original_encoding):
-                        self.soup.declared_html_encoding = new_charset
-                        self.soup._feed(self.soup.declared_html_encoding)
-                        raise StopParsing
-                    pass
-        return False
+            if match is not None:
+                meta_encoding = match.group(3)
+
+                # Modify the tag.
+                def rewrite(match):
+                    return match.group(1) + "%SOUP-ENCODING%"
+                tag['content'] = self.CHARSET_RE.sub(rewrite, content)
 
+        return (meta_encoding is not None)
 
 def register_treebuilders_from(module):
     """Copy TreeBuilders from the given module into this module."""
diff --git a/bs4/testing.py b/bs4/testing.py
index a3e0b38..1b73160 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -368,6 +368,24 @@ class HTMLTreeBuilderSmokeTest(object):
         # For the rest of the story, see TestSubstitutions in
         # test_tree.py.
 
+    def test_html5_style_meta_tag_reflects_current_encoding(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta id="encoding" charset="x-sjis" />')
+
+        # Here's a document incorporating that meta tag.
+        shift_jis_html = (
+            '<html><head>\n%s\n'
+            '<meta http-equiv="Content-language" content="ja"/>'
+            '</head><body>Shift-JIS markup goes here.') % meta_tag
+        soup = self.soup(shift_jis_html)
+
+        # Parse the document, and the charset is replaced with a
+        # generic value.
+        parsed_meta = soup.find('meta', id="encoding")
+        self.assertEqual('%SOUP-ENCODING%', parsed_meta['charset'])
+        self.assertEqual(True, parsed_meta.contains_substitutions)
+
 class XMLTreeBuilderSmokeTest(object):
 
     def test_docstring_generated(self):
author	Leonard Richardson <leonard.richardson@canonical.com>	2012-03-01 13:37:42 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2012-03-01 13:37:42 -0500
commit	76e93195ae2c62cba701bf9452caaea8a48f7833 (patch)
tree	bc480fac48cc2e24c14b1a803c77d01482a5f705
parent	870c2c422aa77bc3cb3a49b39fa8f8b633ec18ad (diff)
download	beautifulsoup4-76e93195ae2c62cba701bf9452caaea8a48f7833.tar.gz