summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2012-03-01 13:37:42 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2012-03-01 13:37:42 -0500
commit 76e93195ae2c62cba701bf9452caaea8a48f7833 (patch)
tree bc480fac48cc2e24c14b1a803c77d01482a5f705
parent 870c2c422aa77bc3cb3a49b39fa8f8b633ec18ad (diff)
download beautifulsoup4-76e93195ae2c62cba701bf9452caaea8a48f7833.tar.gz
In HTML5-style <meta charset="foo"> tags, the value of the "charset" attribute is now replaced with the appropriate encoding on output. [bug=942714]
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/builder/__init__.py 59
-rw-r--r-- bs4/testing.py 18
3 files changed, 51 insertions, 29 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 944c677..a1e650b 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -2,6 +2,9 @@
* Added support for simple CSS selectors, taken from the soupselect project.
+* In HTML5-style <meta charset="foo"> tags, the value of the "charset"
+ attribute is now replaced with the appropriate encoding on output.
+
= 4.0.0b9 (20110228) =
* Fixed the string representation of DOCTYPEs that have both a public
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 4e31572..a38a98f 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -135,7 +135,7 @@ class TreeBuilder(object):
return fragment
def set_up_substitutions(self, tag):
- pass
+ return False
class SAXTreeBuilder(TreeBuilder):
@@ -222,41 +222,42 @@ class HTMLTreeBuilder(TreeBuilder):
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
def set_up_substitutions(self, tag):
+ # We are only interested in <meta> tags
if tag.name != 'meta':
return False
http_equiv = tag.get('http-equiv')
content = tag.get('content')
-
- if (http_equiv is not None
- and content is not None
- and http_equiv.lower() == 'content-type'):
- # This is an interesting meta tag.
+ charset = tag.get('charset')
+
+ # We are interested in <meta> tags that say what encoding the
+ # document was originally in. This means HTML 5-style <meta>
+ # tags that provide the "charset" attribute. It also means
+ # HTML 4-style <meta> tags that provide the "content"
+ # attribute and have "http-equiv" set to "content-type".
+ meta_encoding = None
+ if charset is not None:
+ # HTML 5 style:
+ # <meta charset="utf8">
+ meta_encoding = charset
+
+ # Modify the tag.
+ tag['charset'] = "%SOUP-ENCODING%"
+
+ elif (content is not None and http_equiv is not None
+ and http_equiv.lower() == 'content-type'):
+ # HTML 4 style:
+ # <meta http-equiv="content-type" content="text/html; charset=utf8">
match = self.CHARSET_RE.search(content)
- if match:
- if (self.soup.declared_html_encoding is not None or
- self.soup.original_encoding == self.soup.from_encoding):
- # An HTML encoding was sniffed while converting
- # the document to Unicode, or an HTML encoding was
- # sniffed during a previous pass through the
- # document, or an encoding was specified
- # explicitly and it worked. Rewrite the meta tag.
- def rewrite(match):
- return match.group(1) + "%SOUP-ENCODING%"
- tag['content'] = self.CHARSET_RE.sub(rewrite, content)
- return True
- else:
- # This is our first pass through the document.
- # Go through it again with the encoding information.
- new_charset = match.group(3)
- if (new_charset is not None
- and new_charset != self.soup.original_encoding):
- self.soup.declared_html_encoding = new_charset
- self.soup._feed(self.soup.declared_html_encoding)
- raise StopParsing
- pass
- return False
+ if match is not None:
+ meta_encoding = match.group(3)
+
+ # Modify the tag.
+ def rewrite(match):
+ return match.group(1) + "%SOUP-ENCODING%"
+ tag['content'] = self.CHARSET_RE.sub(rewrite, content)
+ return (meta_encoding is not None)
def register_treebuilders_from(module):
"""Copy TreeBuilders from the given module into this module."""
diff --git a/bs4/testing.py b/bs4/testing.py
index a3e0b38..1b73160 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -368,6 +368,24 @@ class HTMLTreeBuilderSmokeTest(object):
# For the rest of the story, see TestSubstitutions in
# test_tree.py.
+ def test_html5_style_meta_tag_reflects_current_encoding(self):
+ # Here's the <meta> tag saying that a document is
+ # encoded in Shift-JIS.
+ meta_tag = ('<meta id="encoding" charset="x-sjis" />')
+
+ # Here's a document incorporating that meta tag.
+ shift_jis_html = (
+ '<html><head>\n%s\n'
+ '<meta http-equiv="Content-language" content="ja"/>'
+ '</head><body>Shift-JIS markup goes here.') % meta_tag
+ soup = self.soup(shift_jis_html)
+
+ # Parse the document, and the charset is replaced with a
+ # generic value.
+ parsed_meta = soup.find('meta', id="encoding")
+ self.assertEqual('%SOUP-ENCODING%', parsed_meta['charset'])
+ self.assertEqual(True, parsed_meta.contains_substitutions)
+
class XMLTreeBuilderSmokeTest(object):
def test_docstring_generated(self):